diff --git a/CMakeLists.txt b/CMakeLists.txt index 68447727118a91a2a8c0d06404353c7ccb734c6d..317f7f9eb46a96e9f6ea393abf82d608af50fc4b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,12 +138,6 @@ else() set(THIRD_PARTY_BUILD_TYPE Release) endif() -if(WITH_MKL) - option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF) - if (MKL_SPLIT_GEMM) - add_definitions(-DPADDLE_MKL_SPLIT_GEMM) - endif() -endif() set(WITH_MKLML ${WITH_MKL}) if (NOT DEFINED WITH_MKLDNN) if (WITH_MKL AND AVX2_FOUND) @@ -204,6 +198,11 @@ include(external/snappy) # download snappy include(external/snappystream) include(external/threadpool) +include(flags) # set paddle compile flags +include(cudnn) # set cudnn libraries, must before configure +include(cupti) +include(configure) # add paddle env configuration + if(WITH_GPU) include(cuda) include(tensorrt) @@ -212,15 +211,11 @@ elseif() set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE) endif() -include(cudnn) # set cudnn libraries, must before configure -include(cupti) -include(configure) # add paddle env configuration include(generic) # simplify cmake module include(package) # set paddle packages include(ccache) # set ccache for compilation include(util) # set unittest and link libs include(rdma) # set rdma libraries -include(flags) # set paddle compile flags include(version) # set PADDLE_VERSION include(coveralls) # set code coverage include(inference_lib) # add paddle fluid inference libraries diff --git a/cmake/configure.cmake b/cmake/configure.cmake index d14162e0a662afe63152bfc2132e5dfd54f5a86c..e03e15bfc017ce33e06192a7fa8010ffe060adcb 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -50,16 +50,16 @@ if(NOT WITH_PROFILER) endif(NOT WITH_PROFILER) if(NOT CMAKE_CROSSCOMPILING) - if(WITH_AVX AND AVX_FOUND) + if(WITH_AVX AND AVX512F_FOUND) + set(SIMD_FLAG ${AVX512F_FLAG}) + elseif(WITH_AVX AND AVX2_FOUND) + set(SIMD_FLAG ${AVX2_FLAG}) + elseif(WITH_AVX AND AVX_FOUND) set(SIMD_FLAG ${AVX_FLAG}) elseif(SSE3_FOUND) set(SIMD_FLAG ${SSE3_FLAG}) endif() endif() -if(UNIX AND NOT APPLE) - # except apple from nix*Os family - set(LINUX TRUE) -endif(UNIX AND NOT APPLE) if(NOT WITH_GOLANG) add_definitions(-DPADDLE_WITHOUT_GOLANG) @@ -103,15 +103,20 @@ if(WITH_GPU) endif() if(WITH_ANAKIN) if(${CUDA_VERSION_MAJOR} VERSION_LESS 8) - message(FATAL_ERROR "Anakin needs CUDA >= 8.0 to compile") + message(WARNING "Anakin needs CUDA >= 8.0 to compile. Force WITH_ANAKIN=OFF") + set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDA >= 8.0." FORCE) endif() if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) - message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile") + message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF") + set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." 
FORCE) endif() - set(ENV{CUDNN_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR}) - set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY}) - message(STATUS "cudnn include header is ${CUDNN_INCLUDE_DIR}/cudnn.h") - message(STATUS "cudnn library is ${CUDNN_LIBRARY}") + endif() + if(WITH_ANAKIN) + # NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR + # is a softlink to real cudnn.h directory + set(ENV{CUDNN_INCLUDE_DIR} "${CUDNN_INCLUDE_DIR}/") + get_filename_component(CUDNN_LIBRARY_DIR ${CUDNN_LIBRARY} DIRECTORY) + set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY_DIR}) endif() elseif(WITH_AMD_GPU) add_definitions(-DPADDLE_WITH_HIP) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 9eebea816cbfc91052c95ecf99ecc4b0bea4e4c2..cd51533926de7bb132ab7bfab1686d664a331410 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -25,8 +25,25 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib - /usr/lib) -find_library(CUDNN_LIBRARY NAMES libcudnn.so libcudnn.dylib # libcudnn_static.a + /usr/lib + ${CUDA_TOOLKIT_ROOT_DIR} + ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ) +set(CUDNN_LIB_NAME "") +if (LINUX) +set(CUDNN_LIB_NAME "libcudnn.so") +endif(LINUX) + +if(WIN32) +# only support cudnn7 +set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll") +endif(WIN32) + +if(Apple) +set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so") +endif(Apple) + +find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist} NO_DEFAULT_PATH DOC "Path to cuDNN library.") diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index 855897394a602736f3a4fa1132c634f8cb537468..78be0749091fb0a617f9fb172cc92b33560a3552 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -2,6 +2,11 @@ if (NOT WITH_ANAKIN) return() endif() +option(ANAKIN_ENABLE_OP_TIMER "Get more detailed information with Anakin op time" OFF) +if(ANAKIN_ENABLE_OP_TIMER) + add_definitions(-DPADDLE_ANAKIN_ENABLE_OP_TIMER) +endif() + INCLUDE(ExternalProject) set(ANAKIN_SOURCE_DIR ${THIRD_PARTY_PATH}/anakin) # the anakin install dir is only default one now @@ -11,25 +16,36 @@ set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR}) set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so) set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so) -# TODO(luotao): ANAKIN_MODLE_URL will move to demo ci later. -set(ANAKIN_MODLE_URL "http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2.anakin.bin") +# TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later. 
+set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com") +set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin") +set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin") +set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt") execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}") -execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL}") +execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N") +execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N") +execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N") include_directories(${ANAKIN_INCLUDE}) include_directories(${ANAKIN_INCLUDE}/saber/) +include_directories(${ANAKIN_INCLUDE}/saber/core/) +include_directories(${ANAKIN_INCLUDE}/saber/funcs/impl/x86/) +include_directories(${ANAKIN_INCLUDE}/saber/funcs/impl/cuda/base/cuda_c/) -set(ANAKIN_COMPILE_EXTRA_FLAGS +set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable - -Wno-error=unused-variable -Wno-unused-variable + -Wno-error=unused-variable -Wno-unused-variable -Wno-error=format-extra-args -Wno-format-extra-args -Wno-error=comment -Wno-comment -Wno-error=format -Wno-format + -Wno-error=maybe-uninitialized -Wno-maybe-uninitialized -Wno-error=switch -Wno-switch - -Wno-error=return-type -Wno-return-type + -Wno-error=return-type -Wno-return-type -Wno-error=non-virtual-dtor -Wno-non-virtual-dtor + -Wno-error=ignored-qualifiers + -Wno-ignored-qualifiers -Wno-sign-compare - -Wno-reorder + -Wno-reorder -Wno-error=cpp) ExternalProject_Add( @@ -38,7 +54,7 @@ ExternalProject_Add( DEPENDS ${MKLML_PROJECT} # Anakin codes error on Intel(R) Xeon(R) Gold 5117 CPU, temporary do not compile avx512 related code. 
GIT_REPOSITORY "https://github.com/luotao1/Anakin" - GIT_TAG "bcf17aabe7921ceb7bce591244b4f9dce7dba5c8" + GIT_TAG "211d1fc5d813d70c0c14072f9083cf25f40940ea" PREFIX ${ANAKIN_SOURCE_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DUSE_GPU_PLACE=YES @@ -47,6 +63,8 @@ ExternalProject_Add( -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml -DCUDNN_ROOT=${CUDNN_ROOT} + -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR} + -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR} ) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 1120677a37e0d44163816b66600121c8f0d545af..e0556a0babc74ba6efa0a190d4f7b77416bef3bf 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -102,7 +102,6 @@ set(COMMON_FLAGS -fno-omit-frame-pointer -Wall -Wextra - -Werror -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter @@ -115,6 +114,11 @@ set(COMMON_FLAGS -Wno-error=terminate # Warning in PADDLE_ENFORCE ) +# https://github.com/PaddlePaddle/Paddle/issues/12773 +if (NOT WIN32) +list(APPEND COMMON_FLAGS -Werror) +endif() + set(GPU_COMMON_FLAGS -fPIC -fno-omit-frame-pointer @@ -142,6 +146,11 @@ else() ${GPU_COMMON_FLAGS}) endif() +if(UNIX AND NOT APPLE) + # except apple from nix*Os family + set(LINUX TRUE) +endif(UNIX AND NOT APPLE) + foreach(flag ${COMMON_FLAGS}) safe_set_cflag(CMAKE_C_FLAGS ${flag}) diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 53c2de332ea74b06d1bd6e5bb119cad6af27ed01..3eacf4d86aa0385eddb690d72e85e3384929bb99 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -10,6 +10,7 @@ if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID set(SSE3_FLAG "-msse3") set(AVX_FLAG "-mavx") set(AVX2_FLAG "-mavx2") + set(AVX512F_FLAG "-mavx512f") elseif(MSVC) set(MMX_FLAG "/arch:MMX") set(SSE2_FLAG "/arch:SSE2") @@ -81,5 +82,16 @@ int main() return 0; }" AVX2_FOUND) +# Check AVX512F +set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) +set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m512i a = _mm512_undefined_epi32(); + return 0; +}" AVX512F_FOUND) + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) -mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND) +mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/doc/fluid/design/dist_train/dist_train_nccl2.md b/doc/fluid/design/dist_train/dist_train_nccl2.md index aa7455ec5de0d46d7c2b0cef3b7ebf4754af3cb1..b8b8427811cddcddf872db5badfd37c96a76c3e3 100644 --- a/doc/fluid/design/dist_train/dist_train_nccl2.md +++ b/doc/fluid/design/dist_train/dist_train_nccl2.md @@ -1,7 +1,7 @@ # Distributed Training with NCCL2 We design a pattern that can enable training with `ParallelExecutor` and -using [NCCL2](https://developer.nvidia.com/nccl) as it's collective +use [NCCL2](https://developer.nvidia.com/nccl) as it's collective communication library. In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast` @@ -9,14 +9,14 @@ to do multi GPU training. And if we initialize NCCL2 communicators as ranks in a distributed environment, we can simply run the `ParallelExecutor` as a distributed program! The only thing that may be different than in the single node version is that we need to broadcast the NCCL unique ID -to all the nodes, and initialize communicators using that ID, so NCCL2 -will know each other as ranks. 
+to all the nodes and initialize communicators using that ID, so NCCL2 +can know each other as ranks. To achieve this feature, we introduce a new operator: `gen_nccl_id` op, so we are ***not*** "bind to" running NCCL2 with MPI, we can run it in -what ever platform you like. +whatever platform you like. -It have two running modes: +It has two running modes: 1. Generate and broadcast mode, which should be used on trainer 0; 1. Listen and fetch mode, which should be used on trainers other than 0. @@ -29,7 +29,7 @@ initialize NCCL communicator objects. The above figure indicates the general process when training with NCCL2 -distributed. Each trainer have the number of communicators equal to the +distributed. Each trainer has the number of communicators equal to the number of GPUs, but the ranks should match the global ranks number: here we have total 8 GPUs, so `nranks==8`, for each trainer, the ranks should be from 0 ~ 3 on trainer 0 and 4 ~ 7 on trainer 1. diff --git a/doc/fluid/design/others/graph_survey.md b/doc/fluid/design/others/graph_survey.md index 6c6db08f463ae0a2b94fc4546f123a1d7c151870..97f395133b48a1d0ed5136f0ebc8720b8ca87ded 100644 --- a/doc/fluid/design/others/graph_survey.md +++ b/doc/fluid/design/others/graph_survey.md @@ -28,7 +28,7 @@ def get_symbol(num_classes=10, **kwargs): -Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own NodeAttr. There is a op field in NodeAttr class, when a Symbol represents Variable(often input data), the op field is null. +Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own AnyAttr. There is a op field in AnyAttr class, when a Symbol represents Variable(often input data), the op field is null. Symbol contains a data member, std::vector outputs, and NodeEntry cantains a poniter to Node. We can follow the Node pointer to get all the Graph. 
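(Editor's note, not part of the patch: the `dist_train_nccl2.md` changes above describe broadcasting one NCCL unique ID and initializing a communicator per GPU with its global rank. A minimal sketch of that bootstrap sequence, assuming only the standard NCCL2 C API (`ncclGetUniqueId`, `ncclCommInitRank`); in Fluid the broadcast itself is performed by the `gen_nccl_id` op rather than MPI, and that transport is omitted here.)

```cpp
// Illustrative sketch only -- not code from this patch.
// Trainer 0 creates a ncclUniqueId and broadcasts it to every node;
// then each GPU worker initializes a communicator with its *global*
// rank, so nranks == num_trainers * gpus_per_trainer.
#include <cuda_runtime.h>
#include <nccl.h>

ncclComm_t InitNCCLComm(int global_rank, int nranks, int local_gpu,
                        const ncclUniqueId &id) {
  // `id` was produced on trainer 0 via ncclGetUniqueId(&id) and shipped to
  // all trainers (what the gen_nccl_id op does in the design above).
  cudaSetDevice(local_gpu);
  ncclComm_t comm;
  ncclCommInitRank(&comm, nranks, id, global_rank);
  return comm;
}
```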
diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md index 587d819f79fcf82549826359fbf04ad3af404446..ff7408111fa20a7a6a3a2fe9f9ba20835918f399 100644 --- a/doc/fluid/dev/new_op_cn.md +++ b/doc/fluid/dev/new_op_cn.md @@ -36,19 +36,19 @@ OpProtoMake定义 -`.cc`文件,Backward Op不需要定义OpProtoMake +.cc 文件,Backward Op不需要定义OpProtoMake Op定义 - `.cc`文件 + .cc 文件 Kernel实现 - CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。 + CPU、CUDA共享Kernel实现在.h 文件中,否则,CPU 实现在.cc 文件中,CUDA 实现在.cu 文件中。 注册Op - Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中 + Op注册实现在.cc 文件;Kernel注册CPU实现在.cc 文件中,CUDA实现在.cu 文件中 @@ -119,10 +119,29 @@ $$Out = scale*X$$ 这个例子有`AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。 +### 定义GradProtoMaker类 +每个Op的必须有一个对应的GraProtoMaker,若未定制对应前向Op的GradProtoMaker,fluid提供了DefaultGradProtoMaker,默认注册会使用全部输入输出,包括Input, Output, Output@Grad等,使用不需要的变量的会造成显存浪费。 +下面示例定义了ScaleOp的GradProtoMaker。 + +```cpp +class ScaleGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("scale"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("scale", GetAttr("scale")); + return std::unique_ptr(grad_op); + } +}; +``` ### 定义Operator类 -下面的点实现了MulOp的定义: +下面实现了MulOp的定义: ```cpp class MulOp : public framework::OperatorWithKernel { @@ -334,3 +353,83 @@ ctest -R test_mul_op - 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OPERATOR(B, ...)`等,这将会导致单元测试出错。 - 如果Op没有实现CUDA Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。 - 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。 + +### PADDLE_ENFORCE使用注意 + +实现Op时检查数据的合法性需要使用PADDLE_ENFORCE以及PADDLE_ENFORCE_EQ等宏定义,基本格式如下: + +``` +PADDLE_ENFORCE(表达式, 错误提示信息) +PADDLE_ENFORCE_EQ(比较对象A, 比较对象B, 错误提示信息) +``` + +如果表达式为真,或者比较对象A=B,则检查通过,否则会终止程序运行,向用户反馈相应的错误提示信息。 +为了确保提示友好易懂,开发者需要注意其使用方法。 + +#### 总体原则 + +任何使用了PADDLE_ENFORCE与PADDLE_ENFORCE_**检查的地方,必须有详略得当的备注解释!**错误提示信息**不能为空! + +#### 提示信息书写标准 + +1. [required] 哪里错了?为什么错了? + - 例如:`ValueError: Mismatched label shape` +2. [optional] 期望的输入是什么样的?实际的输入是怎样的? + - 例如:`Expected labels dimension=1. Received 4.` +3. [optional] 能否给出修改意见? + - 例如:`Suggested Fix:If your classifier expects one-hot encoding label,check your n_classes argument to the estimatorand/or the shape of your label.Otherwise, check the shape of your label.` + +如果并非必要或者简洁的描述即可表达清楚以上要点,根据情况书写亦可。 + +##### FAQ 典型问题 + +1. 无报错信息或报错信息过于简单,不能给用户提供有效的提示! + +问题示例1 :未写提示信息 +``` +PADDLE_ENFORCE(ctx->HasInput("X"), ""); +``` +问题示例2 :提示信息过于简单 +``` +PADDLE_ENFORCE(i != nullptr, "i must be set"); // i是什么? +``` + +2. 在报错信息中使用开发人员定义的变量缩写,不易理解! + +问题示例: +``` +PADDLE_ENFORCE(forward_pd != nullptr, + "Fail to find eltwise_fwd_pd in device context"); //eltwise_fwd_pd用户可能看不懂 +``` + +3. OP内部调用非法接口:Op内部如果出现Output = ShareDataWith(Input) +问题示例: +```cpp +auto *out = ctx.Output("Out"); +auto *in = ctx.Input("X"); +out->ShareDataWith(*in); +``` +Op内部如果出现Output = ShareDataWith(Input),相当于operator图的中有一条隐藏边,连接了Input和Output,这条边无法在图分析中表达,引发基于图优化的错误。 + +4. OP实现的性能实践 +调用了eigen的broadcast, chop等操作,性能会比手写cuda kernel差几倍以上。此时cpu的实现可以复用eigen,gpu实现可以实现cuda kernel. 
+ + +#### OP InferShape检查提示信息特别说明 + +- 检查输入输出变量,请统一遵循以下格式 +`Input(变量名) of OP名 operator should not be null.` + +正确示例: +``` +PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTMP operator should not be null."); +``` + +- 反向Op的输入输出检查,要写明反向Op的名字 + +正确示例: +``` +PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LoDResetGrad opreator should not be null."); +``` diff --git a/doc/fluid/dev/use_eigen_cn.md b/doc/fluid/dev/use_eigen_cn.md index 75922e7d85a13e53ce94619a48d8da8b960e6c9a..56203d6fad444f61ef1be187ad0d149b2aa99ba4 100644 --- a/doc/fluid/dev/use_eigen_cn.md +++ b/doc/fluid/dev/use_eigen_cn.md @@ -7,7 +7,7 @@ Eigen Tensor模块对element-wise计算提供了强大的支持,并且书写一份代码,可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块,因此可能测试不够完备,文档较少。 -关于Eigen Tensor模块的详细介绍请参考[文档1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) 和[文档2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md) +关于Eigen Tensor模块的详细介绍请参考[Eigen文档](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md) ## paddle::framework::Tensor diff --git a/doc/fluid/howto/cluster/nccl2_rdma_training.md b/doc/fluid/howto/cluster/nccl2_rdma_training.md index cecd5c3a7a7339e3be6772543a534728ec132105..8adaf324fccb4cda7af16b9bace559c0642ae444 100644 --- a/doc/fluid/howto/cluster/nccl2_rdma_training.md +++ b/doc/fluid/howto/cluster/nccl2_rdma_training.md @@ -1,12 +1,12 @@ # Distributed Training with NCCL2 and RDMA -When doing distributed multi-GPU training, network bandwith often becomes the -bottle neck. We introduce a way to use NCCL2 to do such training job to -achieve best performace. +When doing distributed multi-GPU training, network bandwidth often becomes the +bottleneck. We introduce a way to use NCCL2 to do such training job to +achieve best performance. -## Prepare Hardwares with RDMA and Multiple GPUs +## Prepare Hardware with RDMA and Multiple GPUs -I'm using two Linux servers each of them is installed with 8 GPUs and +I'm using two Linux servers each of them installed with 8 GPUs and one 100Gb RDMA card. Base environment is: @@ -25,7 +25,7 @@ In general, the steps including: 1. Use docker to run tests and make sure GPUs and RDMA can work inside the container. -I'll ommit section "Install GPU drivers" because we can find it easily +I'll omit the section "Install GPU drivers" because we can find it easily somewhere else. ### Install RDMA drivers @@ -33,7 +33,7 @@ somewhere else. For my case, I've got two machines with device "Mellanox Technologies MT27700 Family [ConnectX-4]" installed. The OS was "CentOS 7.4" and I updated the kernel to version 4.4 so that docker can -work with latest overlay2 filesystem. +work with the latest overlay2 filesystem. ***NOTE: before you start, make sure you have a way to get a console of the server other than ssh because we may need to re-configure the @@ -45,14 +45,14 @@ network device.*** 1. Run `./mlnxofedinstall --add-kernel-support` in the software package. 1. Run `/etc/init.d/openibd restart` to make everything work, note that this operation may cause the network goes down if you are using this - RDMA device as default network device and use ssh to login the server. + RDMA device as default network device and use ssh to log in the server. 1. Re-configure the network interface, for example: `ifconfig eth2 192.168.16.30/20 up`, then add routes if needed: `ip route add default via 192.168.16.1 dev eth2`. 1. Do the same thing on the other node. 1. 
Use `ping` to test if the two nodes have typical ICMP connection. 1. Use either `udaddy` or `ib_write_bw` to test the network connection is - ready and have the desired bandwith. + ready and have the desired bandwidth. ### Prepare Docker Image to Run RDMA Programs @@ -60,7 +60,7 @@ network device.*** package in it. 1. Start a docker container and mount GPU driver libs into it (you can skip this step if you are using nvidia-docker). -1. Mount RDMA dirvers and libs into the docker image (see below section), +1. Mount RDMA drivers and libs into the docker image (see below section), also `udaddy` and `ib_write_bw` if needed. 1. Mount GPU devices and RDMA devices into the container using `--device` or just use privileged mode `--privileged`. diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index c2a08d2e53c3f637eedafb5eca1192d4e60e7a84..6857b823e94f965b92d1f5636ca2b48f48506138 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -78,7 +78,7 @@ paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', ' paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)) -paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)) paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)) @@ -153,6 +153,7 @@ paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'n paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)) paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -162,6 +163,7 @@ paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], v paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, 
keywords=None, defaults=(None, None)) paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)) paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) @@ -191,7 +193,7 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None) @@ -251,7 +253,6 @@ paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwarg paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 1d62792b80dd002b894da28be9162fc7d3ce054e..2ec422cc17faf7f6b99ac70b5f175881bf017566 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -99,8 +99,13 @@ else() cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method) endif() - -cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass) +if (NOT WIN32) +cc_library(parallel_executor SRCS 
parallel_executor.cc DEPS + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor + graph graph_viz_pass multi_devices_graph_pass + multi_devices_graph_print_pass multi_devices_graph_check_pass + fast_threaded_ssa_graph_executor) +endif() # NOT WIN32 cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) @@ -115,6 +120,8 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) # cc_test(channel_test SRCS channel_test.cc) cc_test(tuple_test SRCS tuple_test.cc ) +cc_test(rw_lock_test SRCS rw_lock_test.cc) + # disable test temporarily. # TODO https://github.com/PaddlePaddle/Paddle/issues/11971 # cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op diff --git a/paddle/fluid/framework/array.h b/paddle/fluid/framework/array.h new file mode 100644 index 0000000000000000000000000000000000000000..be9efcd74924a2050a2fd9ab83059590a1a2a2fd --- /dev/null +++ b/paddle/fluid/framework/array.h @@ -0,0 +1,48 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace framework { +template +class Array { + static_assert(N > 0, "The size of array must be larger than 0"); + + public: + HOSTDEVICE Array() {} + + HOSTDEVICE explicit Array(const T &val) { + for (size_t i = 0; i < N; ++i) data_[i] = val; + } + + HOSTDEVICE const T *Get() const { return data_; } + + HOSTDEVICE T *GetMutable() { return data_; } + + HOSTDEVICE T &operator[](size_t index) { return data_[index]; } + + HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; } + + HOSTDEVICE constexpr size_t size() const { return N; } + + private: + T data_[N]; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 8f6c4163d6ee11fbe83f603f6148c2ac6175324d..abd5459f6d47da6d1341284916b419325dc5977c 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -42,3 +42,5 @@ cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_b cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor) #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory # device_context reduce_op_handle ) +cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc + DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 716d674fa29bad9321fc20979775c06f26bf4679..5183be878eb49cccc68603c3fdd8023be5578036 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ 
b/paddle/fluid/framework/details/execution_strategy.h @@ -19,10 +19,13 @@ namespace framework { namespace details { struct ExecutionStrategy { + enum ExecutorType { kDefault = 0, kExperimental = 1 }; + size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; size_t num_iteration_per_drop_scope_{100}; + ExecutorType type_{kDefault}; }; } // namespace details diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..7606f2bc06b2ecf07c5649eeae1a2d5587a8880c --- /dev/null +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" +#include +#include +#include "paddle/fluid/framework/details/fetch_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( + const ExecutionStrategy &strategy, const std::vector &local_scopes, + const std::vector &places, + std::unique_ptr &&graph) + : strategy_(strategy), + local_scopes_(local_scopes), + places_(places), + graph_(std::move(graph)), + pool_(strategy.num_threads_ + + 1), // add one more thread for generate op_deps + fetch_ctxs_(places) { + auto &ops = graph_->Get("ops"); + + for (auto &op : ops) { + int dep = static_cast(op->NotReadyInputSize()); + op_deps_.emplace(op.get(), dep); + if (dep == 0) { + bootstrap_ops_.emplace_back(op.get()); + } + } + + PrepareAtomicOpDeps(); +} + +FeedFetchList FastThreadedSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + std::unique_ptr>> + op_deps = atomic_op_deps_.get(); + PrepareAtomicOpDeps(); + + paddle::framework::FeedFetchList fetches; + fetches.resize(fetch_tensors.size()); + std::unordered_map> fetched_vars; + std::vector> fetch_nodes; + std::vector> fetch_ops; + + for (auto &fetch_var_name : fetch_tensors) { + for (auto &var_map : graph_->Get("vars")) { + auto it = var_map.find(fetch_var_name); + if (it != var_map.end()) { + fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get()); + } + } + } + + for (size_t i = 0; i < fetch_tensors.size(); ++i) { + auto &var_name = fetch_tensors[i]; + auto fetched_var_it = fetched_vars.find(var_name); + PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(), + "Cannot find fetched variable.(Perhaps the main_program " + "is not set to ParallelExecutor)"); + + auto &vars = fetched_var_it->second; + + fetch_nodes.emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation)); + auto *op = new FetchOpHandle(fetch_nodes.back().get(), &fetches, i, + &local_scopes_); + fetch_ops.emplace_back(op); + + for (auto &p : places_) { + op->SetDeviceContext(p, fetch_ctxs_.Get(p)); + } + + for (auto *var : vars) { + 
op->AddInput(var); + } + + (*op_deps)[op] = static_cast(op->NotReadyInputSize()); + } + + size_t num_complete = 0; + remaining_ = 0; + BlockingQueue complete_q; + for (auto op : bootstrap_ops_) { + RunOpAsync(op_deps.get(), op, &complete_q); + } + + while (num_complete != op_deps->size()) { + size_t num_comp = complete_q.Pop(); + if (num_comp == -1UL) { + int remaining = 0; + while (true) { + remaining = remaining_; + if (remaining == 0) { + break; + } + for (int i = 0; i < remaining; ++i) { + complete_q.Pop(); + } + } + exception_.ReThrow(); + } + num_complete += num_comp; + } + // Wait FetchOps. + if (!fetch_ops.empty()) { + fetch_ops.clear(); + } + return fetches; +} +void FastThreadedSSAGraphExecutor::RunOpAsync( + std::unordered_map> *op_deps, + OpHandleBase *op, BlockingQueue *complete_q) { + ++remaining_; + this->pool_.enqueue([=] { + OpHandleBase *op_to_run = op; + size_t complete = 0; + while (op_to_run != nullptr) { + try { + op_to_run->Run(strategy_.use_cuda_); + ++complete; + } catch (...) { + exception_.Catch(std::current_exception()); + --remaining_; + complete_q->Push(-1UL); + return; + } + auto &outputs = op_to_run->Outputs(); + op_to_run = nullptr; + for (auto &output : outputs) { + for (auto &pending_op : output->PendingOps()) { + std::atomic &deps = op_deps->at(pending_op); + if (deps.fetch_sub(1) == 1) { // pending_op ready + if (op_to_run == nullptr) { + op_to_run = pending_op; + } else { + this->RunOpAsync(op_deps, pending_op, complete_q); + } + } + } + } + } + --remaining_; + complete_q->Push(complete); + }); +} +void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { + atomic_op_deps_ = pool_.enqueue([&] { + std::unordered_map> *op_deps = + new std::unordered_map>; + for (auto &pair : op_deps_) { + (*op_deps)[pair.first] = pair.second; + } + return std::unique_ptr< + std::unordered_map>>(op_deps); + }); +} + +const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..dad3a231cba6402f57ba654a9ac5fb520b9c8f04 --- /dev/null +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "ThreadPool.h" +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/details/exception_holder.h" +#include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/details/ssa_graph_executor.h" + +namespace paddle { +namespace framework { +class Scope; +namespace details { + +class OpHandleBase; +class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { + public: + FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &places, + std::unique_ptr &&graph); + FeedFetchList Run(const std::vector &fetch_tensors) override; + const ir::Graph &Graph() const override; + + private: + ExecutionStrategy strategy_; + std::vector local_scopes_; + std::vector places_; + std::unique_ptr graph_; + + std::unordered_map op_deps_; + std::vector bootstrap_ops_; + + ::ThreadPool pool_; + platform::DeviceContextPool fetch_ctxs_; + std::atomic remaining_; + + void RunOpAsync(std::unordered_map> *op_deps, + OpHandleBase *op, BlockingQueue *complete_q); + + void PrepareAtomicOpDeps(); + + std::future< + std::unique_ptr>>> + atomic_op_deps_; + ExceptionHolder exception_; +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index c5a13e7e1f45e1eb9b4271880630c52d30022f4b..bc61b0eacbf6c8a1fd4487ad5a442fed1b536345 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -763,6 +763,8 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, // Create RPC related op handles that connects its in ops and out ops. void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { + // FIXME(typhoonzero): Cleanup this deps for both sync mode and async mode + // put them into transpiler. int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. @@ -771,26 +773,42 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by // split_byref op - // so that we can balance the variable blocks to all the pserver - // instances. 
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce && node->inputs[0]->Name().find(".block") == std::string::npos) { std::vector input_var_names; for (ir::Node *n : node->inputs) { input_var_names.push_back(n->Name()); } - op_dev_id = GetAppropriateDeviceID(input_var_names); + auto send_param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U); + op_dev_id = GetAppropriateDeviceID({send_param_grad[1]}); + VLOG(10) << "send grad " << input_var_names[0] << " origin " + << send_param_grad[1] << " place: " << op_dev_id; for (auto &varname : input_var_names) { result->Get(kShardedVarDevice) .emplace(varname, op_dev_id); } + result->Get(kShardedVarDevice) + .emplace(send_param_grad[1], op_dev_id); } } else if (node->Op()->Type() == "recv") { std::vector output_var_names; for (ir::Node *n : node->outputs) { output_var_names.push_back(n->Name()); } - op_dev_id = GetAppropriateDeviceID(output_var_names); + auto recv_param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + // FIXME(typhoonzero): assume each recv op output one param + // Use the same place as send. + if (recv_param_grad.size() == 2U) { + op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]); + VLOG(10) << "recv param " << recv_param_grad[0] + << " get grad place: " << recv_param_grad[1] + << " place: " << op_dev_id; + } else { + op_dev_id = GetAppropriateDeviceID(output_var_names); + } for (auto &varname : output_var_names) { result->Get(kShardedVarDevice) .emplace(varname, op_dev_id); diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc index 69944a42b688a9ea5ff29f75f18dd4b156848a27..361c91dc78c08a2cbf84ee88211d389c1e2312e5 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc @@ -54,7 +54,8 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_ << "\\n" << var_handle_ptr->place_ << "\\n" - << var_handle_ptr->version_ << "\"]" << std::endl; + << "scope: " << var_handle_ptr->scope_idx_ << "\\n" + << "v" << var_handle_ptr->version_ << "\"]" << std::endl; } else if (dummy_ptr) { sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl; } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index ee9f9184da65467b82794c99fe3e95b108373753..3812f0abf1b7069525c4420054c61c01c908acfe 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -158,6 +158,16 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p, #endif } +size_t OpHandleBase::NotReadyInputSize() const { + std::unordered_set res; + for (auto *var : inputs_) { + if (var->GeneratedOp() != nullptr) { + res.emplace(var); + } + } + return res.size(); +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 2d7f18942890245249dd0619a40bb43833c9a2ee..9fbefabc841e3f6940860f60d959fee97495e4c9 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -81,6 +81,8 @@ class OpHandleBase { return res.size(); } + size_t NotReadyInputSize() const; + const std::vector 
&Outputs() const { return outputs_; } size_t NoDummyInputSize() const; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 923a7083d4f30b646bbab03d79992b275aa2b403..da0955a9a000e0d0bff3fe9d0bc3bd25171be3d2 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -5,8 +5,12 @@ cc_library(pass SRCS pass.cc DEPS graph node graph_helper) cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper) cc_library(graph_traits SRCS graph_traits.cc DEPS graph) cc_library(graph_pattern_detecter SRCS graph_pattern_detecter.cc DEPS graph graph_helper graph_traits) +cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detecter) +cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass) + cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(test_graph_pattern_detecter SRCS graph_pattern_detecter_tester.cc DEPS graph_pattern_detecter) +cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detecter graph pass graph_traits framework_proto) diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4327742eac843f27385c165216ce48ceb97ea71 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -0,0 +1,192 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fc_fuse_pass.h" +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +bool VarOutLinksToOp(Node* node, const std::string& op_type) { + for (auto* out : node->outputs) { + if (out->IsOp() && out->Op()->Type() == op_type) { + return true; + } + } + return false; +} + +void BuildFCPattern(PDPattern* pattern) { + // make sure the selected MUL op has one input argument is a parameter. + auto* mul_parameter_var = pattern->NewNode( + [](Node* node) { + return node->IsVar() && node->outputs.size() == 1UL && + node->outputs.front()->Op()->Type() == "mul" && node->Var() && + node->Var()->Persistable(); // check is a parameter + }, + "mul_weight" /*name*/); + + auto* mul_tmp_input_var = pattern->NewNode( + [](Node* node) { + bool result = + node->IsVar() && node->outputs.size() >= 1UL && node->Var() && + !node->Var()->Persistable(); // this input is not an parameter. + if (!result) return false; + // check whether one output is MUL op. 
+ for (auto* op : node->outputs) { + if (op->IsOp() && op->Op()->Type() == "mul") return true; + } + return false; + }, + "mul_tmp_var" /*name*/); + + // select a MUL op + auto* mul_op = pattern->NewNode( + [](Node* node) { + return node->IsOp() && // start from an Op + node->Op()->Type() == "mul"; // type is mul + // the output should be consumed only by one element_add, that check + // leaves in a Var PDNode. + }, + "mul" /*name*/); + + // make sure the MUL op's output has only one consumer and links to an + // ELEMENTWISE_ADD op. + auto* mul_out_var = pattern->NewNode( + [](Node* node) { + return node->IsVar() && // starts from a Var + node->outputs.size() == 1UL && // only has one consumer + node->outputs.front()->IsOp() && // check basic logic + node->Var() && // not a ControlDepVar + node->outputs.front()->Op()->Type() == + "elementwise_add"; // a very strong validation + }, + "mul_out"); + // this check is not essential, just to make the corresponding variable Node + // retrival easier. + auto* elementwise_add_tmp_var = pattern->NewNode( + [](Node* node) { + return node->IsVar() && node->outputs.size() >= 1UL && node->Var() && + VarOutLinksToOp(node, "elementwise_add"); + }, + "elementwise_add_tmpvar"); + + // select an ELEMENTWISE_ADD op + auto* elementwise_add_op = pattern->NewNode( + [](Node* node) { + return node->IsOp() && node->Op()->Type() == "elementwise_add"; + }, + "elementwise_add" /*name*/); + + // get the ELEMENTWISE_ADD op's output + auto* elementwise_add_out_var = pattern->NewNode( + [](Node* node) { + return node->IsVar() && node->inputs.size() == 1UL && node->Var() && + node->inputs.front()->Op()->Type() == "elementwise_add"; + }, + "elementwise_add_out"); + + pattern->AddEdge(mul_parameter_var, mul_op); + pattern->AddEdge(mul_tmp_input_var, mul_op); + pattern->AddEdge(mul_op, mul_out_var); + pattern->AddEdge(mul_out_var, elementwise_add_op); + pattern->AddEdge(elementwise_add_tmp_var, elementwise_add_op); + pattern->AddEdge(elementwise_add_op, elementwise_add_out_var); +} + +// Replace the node `from` in the links to `to` +bool LinksReplace(std::vector* links, Node* from, Node* to) { + for (auto*& n : *links) { + if (n == from) { + n = to; + return true; + } + } + return false; +} + +std::unique_ptr FCFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + + std::unordered_set nodes2delete; + + GraphPatternDetecter gpd; + BuildFCPattern(gpd.mutable_pattern()); + +#define GET_NODE(id) \ + PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetriveNode(#id)), \ + "pattern has no Node called %s", #id); \ + auto* id = subgraph.at(gpd.pattern().RetriveNode(#id)); \ + PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); + + auto handler = [&](const GraphPatternDetecter::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle FC fuse"; + // Currently, there is no FC op available, so I will just simulate the + // scenerio. + // FC's fusion is simple, just op fuse, no need to process the + // parameters. + GET_NODE(mul_tmp_var); // x + GET_NODE(mul_weight); // Y + GET_NODE(elementwise_add_tmpvar); // bias + GET_NODE(elementwise_add_out); // Out + GET_NODE(mul); // MUL op + GET_NODE(elementwise_add); // ELEMENT_ADD op + GET_NODE(mul_out); // tmp +#undef GET_NODE + + // Create an FC Node. 
+ OpDesc desc; + std::string fc_x_in = mul_tmp_var->Name(); + std::string fc_Y_in = mul_weight->Name(); + std::string fc_bias_in = elementwise_add_tmpvar->Name(); + std::string fc_out = elementwise_add_out->Name(); + desc.SetInput("Input", std::vector({fc_x_in})); + desc.SetInput("W", std::vector({fc_Y_in})); + desc.SetInput("Bias", std::vector({fc_bias_in})); + desc.SetOutput("Out", std::vector({fc_out})); + desc.SetType("fc"); + auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. + fc_node->inputs = + std::vector({mul_tmp_var, mul_weight, elementwise_add_tmpvar}); + fc_node->outputs.push_back(elementwise_add_out); + + // Update link relatons + PADDLE_ENFORCE(LinksReplace(&mul_tmp_var->outputs, mul, fc_node)); + PADDLE_ENFORCE(LinksReplace(&mul_weight->outputs, mul, fc_node)); + PADDLE_ENFORCE(LinksReplace(&elementwise_add_tmpvar->outputs, + elementwise_add, fc_node)); + PADDLE_ENFORCE( + LinksReplace(&elementwise_add_out->inputs, elementwise_add, fc_node)); + + // Drop old nodes + graph->RemoveNode(mul); + graph->RemoveNode(elementwise_add); + graph->RemoveNode(mul_out); // tmp variable + }; + + gpd(graph.get(), handler); + + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass); diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..eb43dd4486cda578804fb9f6438c67e9e4a03091 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_fuse_pass.h @@ -0,0 +1,36 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detecter.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the MUL and ELEMENTWISE_ADD to a FCOp. + */ +class FCFusePass : public Pass { + public: + virtual ~FCFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..87ba417b1a43475f48380009f8e5cd84699b8e40 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fc_fuse_pass.h" + +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetInput("Xs", inputs); + op->SetOutput("Ys", outputs); +} + +// a->OP0->b +// a->OP1->c +// (b, c)->mul->d +// (d, e)->elementwise_add->f +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : std::vector({"a", "b", "c", "d", "e", "f"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "c") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "OP0", std::vector({"a"}), + std::vector({"b"})); + SetOp(&prog, "OP1", std::vector({"a"}), + std::vector({"c"})); + SetOp(&prog, "mul", std::vector({"b", "c"}), + std::vector({"d"})); + SetOp(&prog, "elementwise_add", std::vector({"d", "e"}), + std::vector({"f"})); + + return prog; +} + +TEST(FCFusePass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("fc_fuse_pass"); + + int pre_nodes = graph->Nodes().size(); + + graph = pass->Apply(std::move(graph)); + + int after_nodes = graph->Nodes().size(); + + // Remove 3 Nodes: MUL,ELEMENTWISE_ADD, mul_out + // Add 1 Node: FC + EXPECT_EQ(pre_nodes - 2, after_nodes); + + // Assert fc op in newly generated graph + int fc_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "fc") { + ++fc_count; + } + } + EXPECT_EQ(fc_count, 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(fc_fuse_pass); diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index f87d5212c0cd87a5a63cf2d54ca677516ab45816..2a6bf4ac230df81b38751000bf4b663f24984db3 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -117,7 +117,15 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { } // For output args, always create a new var. for (auto &each_var_name : op->OutputArgumentNames()) { - ir::Node *var = CreateVarNode(all_vars.at(each_var_name)); + ir::Node *var = nullptr; + if (all_vars.count(each_var_name) != 0) { + var = CreateVarNode(all_vars.at(each_var_name)); + } else { + // Operation output vars can be @EMPTY@. For example, while_grad + // can have multi @EMPTY@ outputs with no VarDesc. + // TODO(panyx0718): Add a test. + var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable); + } var_nodes[each_var_name].push_back(var); node->outputs.push_back(var); var->inputs.push_back(node); @@ -208,7 +216,8 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { // Add write after write dependence ir::Node *upstream_op = (*it_old)->inputs.empty() ? nullptr : (*it_old)->inputs[0]; - if (upstream_op) { + // TODO(zcd): Add a test. 
+ if (upstream_op && upstream_op != write_op) { ir::Node *dep_var = CreateControlDepVar(); write_op->inputs.push_back(dep_var); upstream_op->outputs.push_back(dep_var); diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 5736a5c4e232698085936303d1f23760649f8245..0d27be5fc007746d6ca41ff0dbcea5c5f45599ef 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -98,11 +98,13 @@ class Graph { // Create a normal variable with non-null VarDesc. ir::Node *CreateVarNode(VarDesc *var_desc) { + PADDLE_ENFORCE(var_desc); return AddNode(new ir::Node(var_desc)); } // Create a normal runnable operator with OpDesc. ir::Node *CreateOpNode(OpDesc *op_desc) { + PADDLE_ENFORCE(op_desc); return AddNode(new ir::Node(op_desc)); } @@ -134,6 +136,12 @@ class Graph { return ret; } + void RemoveNode(ir::Node *node) { + PADDLE_ENFORCE(node_set_.find(node) != node_set_.end()); + node_set_.erase(node); + nodes_.erase(node); + } + private: // This method takes ownership of `node`. ir::Node *AddNode(ir::Node *node) { @@ -143,14 +151,8 @@ class Graph { return node; } - void RemoveNode(ir::Node *node) { - PADDLE_ENFORCE(node_set_.find(node) != node_set_.end()); - node_set_.erase(node); - nodes_.erase(node); - } - // NOTE: program_ shouldn't be exposed to user. - const ProgramDesc &program_; + const ProgramDesc program_; std::map attrs_; std::map> attr_dels_; std::map> nodes_; diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index b1c19e6535150130822e9f48685241e62de5b064..dc81a2cac585b50b81f79f8f204ce1145d93eab0 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -104,7 +104,7 @@ std::map> BuildOperationAdjList( for (auto &adj_n : var->inputs) { PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); adj_list[n].insert(adj_n); - VLOG(3) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) + VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) << " -> " << n->Name() << reinterpret_cast(n) << " via " << var->Name() << reinterpret_cast(var); } diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter.cc b/paddle/fluid/framework/ir/graph_pattern_detecter.cc index dcc4382792d6a21dc10c37d2173d8cf1989219a5..e197861251fe5c9f98eaaba2a10b4af371dcbcba 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detecter.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detecter.cc @@ -25,12 +25,30 @@ namespace paddle { namespace framework { namespace ir { +size_t PDPattern::id_ = 0UL; + PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) { + if (!name.empty()) { + PADDLE_ENFORCE_EQ(node_map_.count(name), 0, + "PDNode's name should be unique, get duplicate [%s]", + name); + } + nodes_.emplace_back(new PDNode(std::move(teller), name)); auto* cur = nodes_.back().get(); + node_map_[name] = cur; return cur; } +PDNode* PDPattern::RetriveNode(const std::string& id) const { + auto it = node_map_.find(id); + if (it == node_map_.end()) { + return nullptr; + } + + return it->second; +} + void PDPattern::AddEdge(PDNode* a, PDNode* b) { PADDLE_ENFORCE(a); PADDLE_ENFORCE(b); @@ -51,15 +69,18 @@ void GraphPatternDetecter::operator()(Graph* graph, } bool GraphPatternDetecter::MarkPDNodesInGraph(const ir::Graph& graph) { + VLOG(4) << "mark pdnodes in graph"; if (graph.Nodes().empty()) return false; for (auto& node : GraphTraits::DFS(graph)) { for (const auto& pdnode : pattern_.nodes()) { if (pdnode->Tell(&node)) { + VLOG(4) << "pdnode " << 
pdnode->name() << " marked"; pdnodes2nodes_[pdnode.get()].insert(&node); } } } + VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; return !pdnodes2nodes_.empty(); } @@ -67,10 +88,20 @@ struct HitGroup { std::unordered_map roles; bool Match(Node* node, PDNode* pat) { + if (nodes_.count(node)) { + if (!roles.count(pat)) return false; + return roles[pat] == node; + } return !roles.count(pat) || roles.at(pat) == node; } - void Register(Node* node, PDNode* pat) { roles[pat] = node; } + void Register(Node* node, PDNode* pat) { + roles[pat] = node; + nodes_.insert(node); + } + + private: + std::unordered_set nodes_; }; // Tell whether Node a links to b. @@ -104,6 +135,7 @@ GraphPatternDetecter::DetectPatterns() { // Extend a PDNode to subgraphs by deducing the connection relations defined // in edges of PDNodes. for (const auto& edge : pattern_.edges()) { + VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name(); // Each role has two PDNodes, which indicates two roles. // Detect two Nodes that can match these two roles and they are connected. auto& pre_groups = bi_records[step % 2]; @@ -127,6 +159,7 @@ GraphPatternDetecter::DetectPatterns() { } } } + VLOG(3) << "step " << step << " get records: " << cur_groups.size(); } for (auto& group : bi_records[step % 2]) { diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter.h b/paddle/fluid/framework/ir/graph_pattern_detecter.h index 1778bf00000f60e5cf8b2a585bf7e5dae0a582eb..68c39902b5a79bf25ca7f08529a958274ac64e33 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detecter.h +++ b/paddle/fluid/framework/ir/graph_pattern_detecter.h @@ -96,7 +96,8 @@ class PDPattern { void AddEdge(PDNode* a, PDNode* b); - PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = ""); + PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID()); + PDNode* RetriveNode(const std::string& id) const; const std::vector>& nodes() const { return nodes_; } const std::vector& edges() const { return edges_; } @@ -107,8 +108,12 @@ class PDPattern { FRIEND_TEST(PDPattern, NewNode); #endif + static std::string NewID() { return "pdnode-" + std::to_string(id_++); } + std::vector> nodes_; std::vector edges_; + std::unordered_map node_map_; + static size_t id_; }; /* diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc index 993c885a810fe80a170ed190b892b148d85e8b5f..06f9df5546910f492c9dd1da3e694623898d3d1d 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc @@ -163,8 +163,8 @@ TEST(GraphPatternDetecter, MultiSubgraph) { // 3. Detect op2 -> var2 -> op4 // 4. 
Detect op2 -> var3 -> op5 // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2 - ASSERT_GE(count, 1UL); - ASSERT_LE(count, 2UL); + ASSERT_GE(count, 1); + ASSERT_LE(count, 2); } } // namespace ir diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index b1b8d1c586c98a327a8e5b4890ced00022155e6b..cadda49c399a6d65079cacedfea61f4fd580a69a 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -200,9 +200,11 @@ TEST(GraphTest, WriteAfterWrite) { ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1])); control_dep2 = n->inputs[1]; ASSERT_EQ(n->inputs.size(), 2); - ASSERT_EQ(control_dep1, control_dep2); } } + ASSERT_NE(control_dep1, nullptr); + ASSERT_NE(control_dep2, nullptr); + ASSERT_EQ(control_dep1, control_dep2); } } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_traits.h b/paddle/fluid/framework/ir/graph_traits.h index edbe45acb98326ee3bf1d86495832ec8469b634e..f42bab20ed97e372d2da0c4a492a4458ab94e0a0 100644 --- a/paddle/fluid/framework/ir/graph_traits.h +++ b/paddle/fluid/framework/ir/graph_traits.h @@ -12,7 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once + #include +#include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index 8cb812d1388bf74d173a4dc7a99561e730f8e95a..e7ff0c1dac134334e3baad88886862ebff0fe367 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -25,6 +25,7 @@ static const char kGraphVizPath[] = "graph_viz_path"; std::unique_ptr GraphVizPass::ApplyImpl( std::unique_ptr graph) const { const std::string graph_viz_path = Get(kGraphVizPath); + VLOG(3) << "draw IR graph viz to " << graph_viz_path; std::unique_ptr fout(new std::ofstream(graph_viz_path)); PADDLE_ENFORCE(fout->good()); std::ostream& sout = *fout; diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f885567da1965b997b2063e06c839af95b43e1e1 --- /dev/null +++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferCleanGraphPass : public Pass { + public: + virtual ~InferCleanGraphPass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + + auto is_valid_node = [](Node* x) { + return x && IsControlDepVar(*x) && x->IsVar() && !x->Var(); + }; + + std::unordered_set invalid_nodes; + for (auto* node : graph->Nodes()) { + if (is_valid_node(node)) { + invalid_nodes.insert(node); + } + } + + // remove nodes from the graph. + for (auto* node : invalid_nodes) { + graph->RemoveNode(node); + } + + // clean edges. + for (auto* node : graph->Nodes()) { + CleanEdges(&node->inputs, invalid_nodes); + CleanEdges(&node->outputs, invalid_nodes); + } + + return graph; + } + + void CleanEdges(std::vector* nodes, + const std::unordered_set& to_remove) const { + auto it = std::remove_if(nodes->begin(), nodes->end(), + [&](Node* x) { return to_remove.count(x); }); + nodes->erase(it, nodes->end()); + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(infer_clean_graph_pass, + paddle::framework::ir::InferCleanGraphPass); diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index aca77da8d674f29b89c023717cdcd061232d023a..65c45c7d2038cd06168d50c202dc81b4389cc5ed 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { -const char Node::kControlDepVarName[] = "__control_var"; +constexpr char Node::kControlDepVarName[]; } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 9c0765ab8ce16733ac021aefc8c7b2bb779319f3..aab3180e7e5ece5de5f5227e76f78687700fed87 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -27,21 +27,21 @@ namespace ir { class Node { public: enum class Type { kOperation, kVariable }; - static const char kControlDepVarName[]; + static constexpr char kControlDepVarName[] = "__control_var"; explicit Node(const std::string& name, Type type) : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {} explicit Node(VarDesc* var_desc) : name_(var_desc->Name()), - var_desc_(var_desc), + var_desc_(new VarDesc(*var_desc)), op_desc_(nullptr), type_(Type::kVariable) {} explicit Node(OpDesc* op_desc) : name_(op_desc->Type()), var_desc_(nullptr), - op_desc_(op_desc), + op_desc_(new OpDesc(*op_desc, op_desc->Block())), type_(Type::kOperation) {} Type NodeType() const { return type_; } @@ -50,12 +50,12 @@ class Node { VarDesc* Var() { PADDLE_ENFORCE(type_ == Type::kVariable); - return var_desc_; + return var_desc_.get(); } OpDesc* Op() { - PADDLE_ENFORCE(type_ == Type::kOperation); - return op_desc_; + PADDLE_ENFORCE(IsOp()); + return op_desc_.get(); } bool IsOp() const { return type_ == Type::kOperation; } @@ -66,8 +66,8 @@ class Node { protected: const std::string name_; - VarDesc* var_desc_; - OpDesc* op_desc_; + std::unique_ptr var_desc_; + std::unique_ptr op_desc_; Type type_; private: diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 2288c7fe6609a765612b468d69ad35101b92b384..9c289243c5a27839f628f3e143ce0363bf75a0b1 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ 
b/paddle/fluid/framework/op_proto_maker.cc @@ -129,6 +129,10 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, "Optimized for variable") .SetDefault({}); + AddAttr>(OpCreationCallstackAttrName(), + "Callstack for Op Creatation.") + .SetDefault({}); + Validate(); } diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 80970291c9c234f1306162f4ffa3c2528f88c35f..cb9c8ab1704ab867182079db31a34125669c645b 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -39,6 +39,7 @@ class OpProtoAndCheckerMaker { public: static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleVarAttrName() { return "op_role_var"; } + static const char *OpCreationCallstackAttrName() { return "op_callstack"; } void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index d04f7744961b2561977f4d36d0073a97557043da..9f8cdf1aeba43d30676cb2adf80a77cab86547a8 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -11,15 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include - +#include "paddle/fluid/framework/operator.h" #include - +#include +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/profiler.h" @@ -127,19 +129,48 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - VLOG(4) << place << " " << DebugStringEx(&scope); - if (platform::is_gpu_place(place)) { + try { + if (VLOG_IS_ON(4)) { + VLOG(4) << place << " " << DebugStringEx(&scope); + } + if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("Cannot run operator on place %s", place); + PADDLE_THROW("Cannot run operator on place %s", place); #else - auto dev_id = boost::get(place).device; - platform::SetDeviceId(dev_id); + auto dev_id = boost::get(place).device; + platform::SetDeviceId(dev_id); #endif + } + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + RunImpl(scope, place); + if (VLOG_IS_ON(3)) { + VLOG(3) << place << " " << DebugStringEx(&scope); + } + } catch (platform::EnforceNotMet exception) { + if (Attrs().count("sub_block") != 0) { + throw exception; + } + + auto& callstack = Attr>( + OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + + if (callstack.empty()) { + throw exception; + } + std::ostringstream sout; + sout << "Invoke operator " << Type() << " error.\n"; + sout << "Python Callstacks: \n"; + for (auto& line : callstack) { + sout << line; + } + sout << "C++ Callstacks: \n"; + sout << exception.err_str_; + exception.err_str_ = sout.str(); + throw exception; + } catch (...) 
{ + std::rethrow_exception(std::current_exception()); } - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - RunImpl(scope, place); - VLOG(3) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { @@ -167,7 +198,7 @@ const std::vector& OperatorBase::Inputs( } bool OperatorBase::HasOutputs(const std::string& name) const { - if (outputs_.find(name) != outputs_.end()) { + if (outputs_.end() != outputs_.find(name)) { return true; } else { return false; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 275cb8c592c3c0b153d31149570cd6596b9e1a7f..81cb24bdda6b87a3d708cf5047dce05d5020a0d5 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/platform/nccl_helper.h" #endif +#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" @@ -193,8 +194,14 @@ ParallelExecutor::ParallelExecutor( member_->local_scopes_, member_->use_cuda_, build_strategy); #endif - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, places, std::move(graph))); + } else { + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, places, std::move(graph))); + } + member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( exec_strategy, member_->local_scopes_, std::move(var_infos), member_->places_, std::move(member_->executor_))); diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 20bdc7830f32564448a69e9cd76c02585b7a1aca..344c001a69b53c82967ee983783892a514c2490b 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -55,11 +55,20 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { auto all_ops = blocks_[block_id]->AllOps(); for (size_t op_id = 0; op_id < all_ops.size(); ++op_id) { auto &op = all_ops[op_id]; + for (const std::string &attr_name : op->AttrNames()) { if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) { int sub_block_id = o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name); op->SetBlockAttr(attr_name, MutableBlock(sub_block_id)); + } else if (op->GetAttrType(attr_name) == proto::AttrType::BLOCKS) { + std::vector sub_block_ids = + o.Block(block_id).Op(op_id)->GetBlocksAttrIds(attr_name); + std::vector block_descs; + for (int block_id : sub_block_ids) { + block_descs.push_back(MutableBlock(block_id)); + } + op->SetBlocksAttr(attr_name, block_descs); } } } @@ -68,24 +77,16 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { desc_ = desc; - for (auto &block_desc : *desc_.mutable_blocks()) { - blocks_.emplace_back(new BlockDesc(this, &block_desc)); - } - for (auto &block : blocks_) { - for (auto *op : block->AllOps()) { - for (const auto &attr : op->Proto()->attrs()) { - if (attr.type() == 
proto::AttrType::BLOCK) { - size_t blk_idx = attr.block_idx(); - op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx)); - } - } - } - } + InitFromProto(); } ProgramDesc::ProgramDesc(const std::string &binary_str) { PADDLE_ENFORCE(desc_.ParseFromString(binary_str), "Fail to parse program_desc from binary string."); + InitFromProto(); +} + +void ProgramDesc::InitFromProto() { for (auto &block_desc : *desc_.mutable_blocks()) { blocks_.emplace_back(new BlockDesc(this, &block_desc)); } @@ -95,6 +96,13 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) { if (attr.type() == proto::AttrType::BLOCK) { size_t blk_idx = attr.block_idx(); op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx)); + } else if (attr.type() == proto::AttrType::BLOCKS) { + auto blks_idx = attr.blocks_idx(); + std::vector block_descs; + for (int blk_idx : blks_idx) { + block_descs.push_back(this->MutableBlock(blk_idx)); + } + op->SetBlocksAttr(attr.name(), block_descs); } } } diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index 65fa0a0cfd5ba6d9b8765cee1309e118cb74348a..f3afc85eb924e4b03b7597e043ffd4e267adc977 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -76,6 +76,8 @@ class ProgramDesc { void SetFetchHolderName(const std::string &fetch_holder_name); private: + void InitFromProto(); + proto::ProgramDesc desc_; std::vector> blocks_; diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc index 6c46e9aad5b7fbf67fdcc07a12e7932ac8b6412b..925ea98dbe62e4da91689f6e56c135e51c24a8a3 100644 --- a/paddle/fluid/framework/program_desc_test.cc +++ b/paddle/fluid/framework/program_desc_test.cc @@ -42,6 +42,19 @@ TEST(ProgramDesc, copy_ctor) { out->SetType(proto::VarType::LOD_TENSOR); op->SetOutput("Y", {out->Name()}); + BlockDesc* new_block = program.AppendBlock(*global_block); + op = new_block->AppendOp(); + op->SetType("mul"); + + op = global_block->AppendOp(); + op->SetType("op_with_subblock"); + op->SetAttr("sub_block", new_block); + + std::vector sub_blocks; + sub_blocks.push_back(program.AppendBlock(*global_block)); + sub_blocks.push_back(program.AppendBlock(*global_block)); + op->SetAttr("sub_blocks", sub_blocks); + ProgramDesc program_copy(program); auto* global_block_copy = program_copy.MutableBlock(0); @@ -64,6 +77,8 @@ TEST(ProgramDesc, copy_ctor) { assert_same_var("Y", y); assert_same_var("Out", out); + bool found_sub_block = false; + bool found_sub_blocks = false; for (size_t i = 0; i < global_block->OpSize(); ++i) { auto op_origin = global_block->Op(i); auto op_copy = global_block_copy->Op(i); @@ -74,8 +89,17 @@ TEST(ProgramDesc, copy_ctor) { ASSERT_EQ(op_copy->Proto()->SerializeAsString(), op_origin->Proto()->SerializeAsString()); - } + if (op->Type() == "op_with_subblock") { + ASSERT_EQ(1, op->GetBlockAttrId("sub_block")); + found_sub_block = true; + + ASSERT_EQ(2, op->GetBlocksAttrIds("sub_blocks").size()); + found_sub_blocks = true; + } + } + ASSERT_TRUE(found_sub_block); + ASSERT_TRUE(found_sub_blocks); // Not check block's protostr are same it because the order of vars could be // different and it is correct. } diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h new file mode 100644 index 0000000000000000000000000000000000000000..1418fb5134fdde2392da912b5f1bd9fc74e58400 --- /dev/null +++ b/paddle/fluid/framework/rw_lock.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <pthread.h> + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +struct RWLock { + RWLock() { pthread_rwlock_init(&lock_, nullptr); } + + ~RWLock() { pthread_rwlock_destroy(&lock_); } + + void RDLock() { + PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0, + "acquire read lock failed"); + } + + void WRLock() { + PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, + "acquire write lock failed"); + } + + void UNLock() { + PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); + } + + private: + pthread_rwlock_t lock_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/rw_lock_test.cc b/paddle/fluid/framework/rw_lock_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..16f9cbb65229f10912ee90436c3557aaaca169b8 --- /dev/null +++ b/paddle/fluid/framework/rw_lock_test.cc @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/rw_lock.h" +#include <gtest/gtest.h> +#include <chrono> // NOLINT +#include <thread> // NOLINT +#include <vector> + +namespace f = paddle::framework; + +void f1(f::RWLock *lock) { + lock->RDLock(); + lock->UNLock(); +} + +TEST(RWLOCK, read_read) { + f::RWLock lock; + lock.RDLock(); + std::thread t1(f1, &lock); + std::thread t2(f1, &lock); + t1.join(); + t2.join(); + lock.UNLock(); +} + +void f2(f::RWLock *lock, std::vector *result) { + lock->RDLock(); + ASSERT_EQ(result->size(), 0UL); + lock->UNLock(); +} + +void f3(f::RWLock *lock, std::vector *result) { + lock->WRLock(); + result->push_back(1); + lock->UNLock(); +} + +TEST(RWLOCK, read_write) { + f::RWLock lock; + std::vector result; + + lock.RDLock(); + std::thread t1(f2, &lock, &result); + t1.join(); + std::thread t2(f3, &lock, &result); + std::this_thread::sleep_for(std::chrono::seconds(1)); + ASSERT_EQ(result.size(), 0UL); + lock.UNLock(); + t2.join(); + ASSERT_EQ(result.size(), 1UL); +} + +void f4(f::RWLock *lock, std::vector *result) { + lock->RDLock(); + ASSERT_EQ(result->size(), 1UL); + lock->UNLock(); +} + +TEST(RWLOCK, write_read) { + f::RWLock lock; + std::vector result; + + lock.WRLock(); + std::thread t1(f4, &lock, &result); + std::this_thread::sleep_for(std::chrono::seconds(1)); + result.push_back(1); + lock.UNLock(); + t1.join(); +} diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 06ed87e7e8a2d5324b48a466b05207042ec1b7fa..a4319ffabb04f39437b76d97845e021ef9de66d3 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -120,66 +120,76 @@ bool SelectedRows::HasKey(int64_t key) const { : true; } -std::vector> SelectedRows::Get( - const std::vector& keys, framework::Tensor* value) const { +int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) { + rwlock_->RDLock(); + auto iter = id_to_index_.find(key); + if (iter == id_to_index_.end()) { + rwlock_->UNLock(); + if (!auto_grown) { + PADDLE_THROW("key %d not found", key); + } + rwlock_->WRLock(); + auto map_size = id_to_index_.size(); + auto vector_size = rows_.size(); + if (map_size != vector_size) { + rwlock_->UNLock(); + PADDLE_THROW( + "id_to_index_ size %d should have the same size with rows_ %d", + map_size, vector_size); + } + auto write_iter = id_to_index_.find(key); + if (write_iter == id_to_index_.end()) { + int row_num = rows_.size(); + if (row_num == value_->dims()[0]) { + rwlock_->UNLock(); + PADDLE_THROW("selected rows is full, then length exceed %d", row_num); + } + // key logic to put a key into id_to_index_ + rows_.push_back(key); + auto index = static_cast(rows_.size() - 1); + id_to_index_[key] = index; + rwlock_->UNLock(); + return index; + } else { + auto index = write_iter->second; + rwlock_->UNLock(); + return index; + } + } else { + auto index = iter->second; + rwlock_->UNLock(); + return index; + } +} + +void SelectedRows::SyncIndex() { + rwlock_->WRLock(); + id_to_index_.clear(); + for (size_t i = 0; i < rows_.size(); ++i) { + id_to_index_[rows_[i]] = i; + } + rwlock_->UNLock(); +} + +void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, + bool auto_grown) { PADDLE_ENFORCE(value->IsInitialized(), "The value tensor should be initialized."); - std::vector> non_keys_pair; - if (keys.empty()) { + if (ids.numel() == 0) { VLOG(3) << "keys is empty, please check data!"; } else { int64_t value_width = value_->numel() / value_->dims()[0]; PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0], "output tensor should have the 
same shape with table " "except the dims[0]."); - - for (size_t i = 0; i < keys.size(); ++i) { - int64_t index = Index(keys[i]); - if (index == -1) { - non_keys_pair.push_back( - std::make_pair(keys[i], static_cast(i))); - } else { - framework::VisitDataType( - framework::ToDataType(value_->type()), - TensorCopyVisitor(value, i * value_width, *value_.get(), - index * value_width, value_width)); - } + for (int i = 0; i < ids.numel(); ++i) { + int64_t index = AutoGrownIndex(ids.data()[i], auto_grown); + framework::VisitDataType( + framework::ToDataType(value_->type()), + TensorCopyVisitor(value, i * value_width, *value_.get(), + index * value_width, value_width)); } } - return non_keys_pair; -} - -bool SelectedRows::Set(int64_t key, const framework::Tensor& value) { - PADDLE_ENFORCE(value.IsInitialized(), "The value should be initialized."); - if (value_->IsInitialized()) { - PADDLE_ENFORCE_EQ( - value.type(), value_->type(), - "The type of the value should be same with the original value"); - } - PADDLE_ENFORCE_EQ(value.dims()[0], static_cast(1), - "The first dim of value should be 1."); - std::lock_guard lock(*auto_grown_mutex_.get()); - auto index = Index(key); - bool is_new_key = false; - if (index == -1) { - rows_.push_back(key); - index = rows_.size() - 1; - is_new_key = true; - // whether need to resize the table - if (static_cast(rows_.size()) > value_->dims()[0]) { - auto dims = value_->dims(); - dims[0] = (dims[0] + 1) << 1; - framework::VisitDataType(framework::ToDataType(value.type()), - ReAllocateVisitor(dims, value_.get())); - } - } - - framework::VisitDataType( - framework::ToDataType(value.type()), - TensorCopyVisitor(value_.get(), - index * value_->numel() / value_->dims()[0], value, - static_cast(0), value.numel())); - return is_new_key; } } // namespace framework diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 7160670ddd204c20021ea87cdd67ee4721d03451..daf5e95304fb84eaba26a30c45414d5021e7ffcb 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -17,10 +17,12 @@ limitations under the License. */ #include #include #include // NOLINT +#include #include #include #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/memcpy.h" @@ -48,13 +50,13 @@ class SelectedRows { SelectedRows(const std::vector& rows, const int64_t& height) : rows_(rows), height_(height) { value_.reset(new Tensor()); - auto_grown_mutex_.reset(new std::mutex); + rwlock_.reset(new RWLock); } SelectedRows() { height_ = 0; value_.reset(new Tensor()); - auto_grown_mutex_.reset(new std::mutex); + rwlock_.reset(new RWLock); } platform::Place place() const { return value_->place(); } @@ -74,47 +76,51 @@ class SelectedRows { void set_rows(const Vector& rows) { rows_ = rows; } /* - * @brief wheter has the specified key in the table. + * @brief Get the index of key in rows + * + * @return -1 if the key does not exists. + */ + int64_t Index(int64_t key) const { + auto it = std::find(rows_.begin(), rows_.end(), key); + if (it == rows_.end()) { + PADDLE_THROW("id %s not in table", key); + } + return static_cast(std::distance(rows_.begin(), it)); + } + + /* + * @brief whether has the specified key in the table. * * @return true if the key is exists. */ bool HasKey(int64_t key) const; /* - * @brief Get value by the key list, if the + * @brief Get value by the key list. + * Note!!! 
this interface is only used when selected_rows is used as + * parameters + * for distribute lookup table. * * @return a list of pair which contains the non-exists key and the index in * the value */ - std::vector> Get(const std::vector& keys, - framework::Tensor* value) const; + void Get(const framework::Tensor& ids, framework::Tensor* value, + bool auto_grown = false); /* - * @brief Set a key-value pair into the table. - * This function will double the value memory if it's not engouth. + * @brief Get the index of the key from id_to_index_ map. If the key not + * exist, + * add the key into id_to_index_. * - * @note: - * 1. The first dim of the value should be 1 - * 2. The value should be initialized and the data type - * should be the same with the table. - * - * @return true if the key is a new one, otherwise false + * Note!!! this interface is only used when selected_rows is used as + * parameters + * for distribute lookup table. * + * @return index of the key. */ - bool Set(int64_t key, const Tensor& value); + int64_t AutoGrownIndex(int64_t key, bool auto_grown); - /* - * @brief Get the index of key in rows - * - * @return -1 if the key does not exists. - */ - int64_t Index(int64_t key) const { - auto it = std::find(rows_.begin(), rows_.end(), key); - if (it == rows_.end()) { - return static_cast(-1); - } - return static_cast(std::distance(rows_.begin(), it)); - } + void SyncIndex(); DDim GetCompleteDims() const { std::vector dims = vectorize(value_->dims()); @@ -127,9 +133,10 @@ class SelectedRows { // SelectedRows are simply concated when adding together. Until a // SelectedRows add a Tensor, will the duplicate rows be handled. Vector rows_; + std::unordered_map id_to_index_; std::unique_ptr value_{nullptr}; int64_t height_; - std::unique_ptr auto_grown_mutex_{nullptr}; + std::unique_ptr rwlock_{nullptr}; }; /* diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc index eefcaa5672c5a3debf162f5c8eda653408dcf221..5ca864cfdf7176850dd31dd42ef3306061a742cf 100644 --- a/paddle/fluid/framework/selected_rows_test.cc +++ b/paddle/fluid/framework/selected_rows_test.cc @@ -9,8 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/framework/selected_rows.h" +#include +#include // NOLINT + #include "gtest/gtest.h" +#include "paddle/fluid/framework/selected_rows.h" namespace paddle { namespace framework { @@ -59,39 +62,129 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) { ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); } -TEST_F(SelectedRowsTester, SparseTable) { +TEST(SelectedRows, SparseTable) { platform::CPUPlace cpu; SelectedRows table; + + int64_t table_size = 100; + int64_t embedding_width = 8; // initialize a sparse table - table.mutable_value()->Resize(framework::make_ddim({1, 100})); - table.mutable_value()->mutable_data(cpu); - table.mutable_rows()->push_back(1); + table.mutable_value()->Resize( + framework::make_ddim({table_size, embedding_width})); + auto* data = table.mutable_value()->mutable_data(cpu); + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + data[i * embedding_width + j] = static_cast(i); + } + } + ASSERT_EQ(table.AutoGrownIndex(10, true), 0); + ASSERT_EQ(table.AutoGrownIndex(8, true), 1); + ASSERT_EQ(table.AutoGrownIndex(8, true), 1); + ASSERT_EQ(table.AutoGrownIndex(6, true), 2); + ASSERT_TRUE(table.HasKey(10)); + ASSERT_TRUE(table.HasKey(8)); + ASSERT_TRUE(table.HasKey(6)); + ASSERT_EQ(table.rows().size(), 3); + + framework::Tensor ids; + ids.Resize(framework::make_ddim({4})); + auto* ids_data = ids.mutable_data(cpu); + ids_data[0] = static_cast(6); + ids_data[1] = static_cast(6); + ids_data[2] = static_cast(8); + ids_data[3] = static_cast(10); - int64_t key = 10000; - int64_t non_key = 999; - framework::Tensor value; - value.Resize(framework::make_ddim({1, 100})); - auto ptr = value.mutable_data(cpu); - ptr[0] = static_cast(10); + framework::Tensor get_value; + auto* value_data = get_value.mutable_data( + framework::make_ddim({4, embedding_width}), cpu); + table.Get(ids, &get_value); - ASSERT_EQ(table.rows().size(), static_cast(1)); - ASSERT_EQ(table.HasKey(key), false); + for (int j = 0; j < embedding_width; ++j) { + ASSERT_EQ(value_data[0 * embedding_width + j], 2); + } + for (int j = 0; j < embedding_width; ++j) { + ASSERT_EQ(value_data[1 * embedding_width + j], 2); + } + for (int j = 0; j < embedding_width; ++j) { + ASSERT_EQ(value_data[2 * embedding_width + j], 1); + } + for (int j = 0; j < embedding_width; ++j) { + ASSERT_EQ(value_data[3 * embedding_width + j], 0); + } +} - table.Set(key, value); +void f1(SelectedRows* table, int table_size) { + for (int i = 1000000; i > 0; --i) { + auto id = i % table_size; + int64_t index1 = table->AutoGrownIndex(id, true); + int64_t index2 = table->AutoGrownIndex(id, false); + int64_t index3 = table->AutoGrownIndex(id, true); + ASSERT_EQ(index1, index2); + ASSERT_EQ(index2, index3); + } +} - ASSERT_EQ(table.rows().size(), static_cast(2)); - ASSERT_EQ(table.HasKey(key), true); - // check re-allocate - ASSERT_EQ(table.value().dims()[0], static_cast(4)); +void f2(SelectedRows* table, int table_size) { + for (int i = 0; i < 1000000; ++i) { + auto id = i % table_size; + int64_t index1 = table->AutoGrownIndex(id, true); + int64_t index2 = table->AutoGrownIndex(id, false); + int64_t index3 = table->AutoGrownIndex(id, true); + ASSERT_EQ(index1, index2); + ASSERT_EQ(index2, index3); + } +} - framework::Tensor get_value; - get_value.mutable_data(framework::make_ddim({2, 100}), cpu); - std::vector keys({non_key, key}); - auto non_key_pairs = table.Get(keys, &get_value); +void f3(SelectedRows* table, int table_size) { + clock_t t1 = clock(); + for (int i 
= 100000; i > 0; --i) { + auto id1 = table->AutoGrownIndex(i % table_size, true); + auto id2 = table->Index(i % table_size); + ASSERT_EQ(id1, id2); + } + clock_t t2 = clock(); + std::cout << "f3 run time:" << t2 - t1 << std::endl; +} + +void f4(SelectedRows* table, int table_size) { + clock_t t1 = clock(); + for (int i = 0; i < 100000; ++i) { + auto id1 = table->AutoGrownIndex(i % table_size, true); + auto id2 = table->Index(i % table_size); + ASSERT_EQ(id1, id2); + } + clock_t t2 = clock(); + std::cout << "f4 run time:" << t2 - t1 << std::endl; +} + +TEST(SelectedRows, MultiThreadAutoIndex) { + platform::CPUPlace cpu; + SelectedRows table; + + int64_t table_size = 100000; + int64_t embedding_width = 8; + // initialize a sparse table + table.mutable_value()->Resize( + framework::make_ddim({table_size, embedding_width})); + auto* data = table.mutable_value()->mutable_data(cpu); + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + data[i * embedding_width + j] = static_cast(i); + } + } - ASSERT_EQ(get_value.data()[100], static_cast(10)); - ASSERT_EQ(non_key_pairs.size(), static_cast(1)); - ASSERT_EQ(non_key_pairs[0].first, non_key); + std::thread t1(f1, &table, table_size); + std::thread t11(f1, &table, table_size); + std::thread t2(f2, &table, table_size); + std::thread t22(f2, &table, table_size); + t1.join(); + t11.join(); + t2.join(); + t22.join(); + std::thread t3(f3, &table, table_size); + std::thread t4(f4, &table, table_size); + t3.join(); + t4.join(); } } // namespace framework diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 27fe575cb6167a726ff92a8f3d2e47b6f536ba39..4feaed2b0d9cdec735bd3fadc98aa2bad715c209 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,13 +1,17 @@ +cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc + analyzer.cc + helper.cc + # passes fluid_to_data_flow_graph_pass.cc data_flow_graph_to_fluid_pass.cc dfg_graphviz_draw_pass.cc tensorrt_subgraph_pass.cc tensorrt_subgraph_node_mark_pass.cc - analyzer.cc - helper.cc - model_store_pass.cc - DEPS framework_proto proto_desc) + fluid_to_ir_pass.cc + model_store_pass.cc + DEPS framework_proto proto_desc ir_pass_manager graph pass) + cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis) @@ -18,7 +22,7 @@ function (inference_analysis_test TARGET) if(WITH_TESTING) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS) + set(multiValueArgs SRCS EXTRA_DEPS) cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(mem_opt "") @@ -27,19 +31,51 @@ function (inference_analysis_test TARGET) endif() cc_test(${TARGET} SRCS "${analysis_test_SRCS}" - DEPS analysis + DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detecter pass ${analysis_test_EXTRA_DEPS} ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt}) set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) endif(WITH_TESTING) endfunction(inference_analysis_test) +set(DITU_RNN_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fmodel.tar.gz") +set(DITU_RNN_DATA_URL 
"http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fdata.txt.tar.gz") +set(DITU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/ditu_rnn" CACHE PATH "Ditu RNN model and data root." FORCE) +set(DITU_RNN_MODEL ${DITU_INSTALL_DIR}/model) +set(DITU_RNN_DATA ${DITU_INSTALL_DIR}/data.txt) + +function (inference_download_and_uncompress target url gz_filename) + message(STATUS "Download inference test stuff ${gz_filename} from ${url}") + execute_process(COMMAND bash -c "mkdir -p ${DITU_INSTALL_DIR}") + execute_process(COMMAND bash -c "cd ${DITU_INSTALL_DIR} && wget -q ${url}") + execute_process(COMMAND bash -c "cd ${DITU_INSTALL_DIR} && tar xzf ${gz_filename}") + message(STATUS "finish downloading ${gz_filename}") +endfunction(inference_download_and_uncompress) + +if (NOT EXISTS ${DITU_INSTALL_DIR}) + inference_download_and_uncompress(ditu_rnn_model ${DITU_RNN_MODEL_URL} "ditu_rnn_fluid%2Fmodel.tar.gz") + inference_download_and_uncompress(ditu_rnn_data ${DITU_RNN_DATA_URL} "ditu_rnn_fluid%2Fdata.txt.tar.gz") +endif() + +inference_analysis_test(test_analyzer SRCS analyzer_tester.cc + EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis + # ir + fc_fuse_pass + graph_viz_pass + infer_clean_graph_pass + graph_pattern_detecter + infer_clean_graph_pass + pass + ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model + --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model + --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt) + inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) +inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc) inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc) inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) -inference_analysis_test(test_analyzer SRCS analyzer_tester.cc) inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index 9318f1089781b30468cf4d3c7151d0dd26e50a9c..7d16364609463e9c48720e772cebee7731dfd452 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -17,22 +17,23 @@ #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" #include "paddle/fluid/inference/analysis/model_store_pass.h" #include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" -namespace paddle { - -DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, true, +DEFINE_bool(IA_enable_tensorrt_subgraph_engine, false, "Enable subgraph to TensorRT engine for acceleration"); -DEFINE_string(inference_analysis_graphviz_log_root, 
"./", +DEFINE_bool(IA_enable_ir, false, "Turn on IR support"); + +DEFINE_string(IA_graphviz_log_root, "./", "Graphviz debuger for data flow graphs."); -DEFINE_string(inference_analysis_output_storage_path, "", - "optimized model output path"); +DEFINE_string(IA_output_storage_path, "", "optimized model output path"); +namespace paddle { namespace inference { namespace analysis { @@ -40,11 +41,38 @@ class DfgPassManagerImpl final : public DfgPassManager { public: DfgPassManagerImpl() { // TODO(Superjomn) set the key with pass reprs. - AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); - if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) { + LOG(INFO) + << "-----------------------------------------------------------------"; + if (FLAGS_IA_enable_ir) { + AddPass("fluid-to-ir-pass", new FluidToIrPass); + } else { + AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); + } + TryAddTensorRtPass(); + AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); + if (!FLAGS_IA_output_storage_path.empty()) { + AddPass("model-store-pass", new ModelStorePass); + } + LOG(INFO) + << "-----------------------------------------------------------------"; + } + + std::string repr() const override { return "dfg-pass-manager"; } + std::string description() const override { return "DFG pass manager."; } + + private: + void AddPass(const std::string& name, Pass* pass) { + VLOG(3) << "Adding pass " << name; + Register(name, pass); + AddGraphvizDebugerPass(pass); + } + + void TryAddTensorRtPass() { + if (FLAGS_IA_enable_tensorrt_subgraph_engine) { auto trt_teller = [&](const Node* node) { std::unordered_set teller_set( - {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax"}); + {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax", + "depthwise_conv2d", "batch_norm"}); if (!node->IsFunction()) return false; const auto* func = static_cast(node); @@ -59,20 +87,6 @@ class DfgPassManagerImpl final : public DfgPassManager { new TensorRTSubgraphNodeMarkPass(trt_teller)); AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); } - AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); - if (!FLAGS_inference_analysis_output_storage_path.empty()) { - AddPass("model-store-pass", new ModelStorePass); - } - } - - std::string repr() const override { return "dfg-pass-manager"; } - std::string description() const override { return "DFG pass manager."; } - - private: - void AddPass(const std::string& name, Pass* pass) { - LOG(INFO) << "Adding pass " << name; - Register(name, pass); - AddGraphvizDebugerPass(pass); } // Add the graphviz debuger pass if the parent pass has one. diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index c82fdfff86c91b4e07e3c1b80987d3d8d796ad23..2e107c82dd50d5cf22797f4c82e69d302514f955 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -39,14 +39,14 @@ limitations under the License. */ #include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/pass_manager.h" -namespace paddle { - // TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this // flag if not available. 
-DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine); -DECLARE_string(inference_analysis_graphviz_log_root); -DECLARE_string(inference_analysis_output_storage_path); +DECLARE_bool(IA_enable_tensorrt_subgraph_engine); +DECLARE_string(IA_graphviz_log_root); +DECLARE_string(IA_output_storage_path); +DECLARE_bool(IA_enable_ir); +namespace paddle { namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 24bfb3993cf569561980006b6627b56327dd0f67..baa7600283a9bc0b81833b419a2ea64692ed2203 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -13,15 +13,25 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/analyzer.h" + #include +#include +#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN"); +DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN"); +DEFINE_int32(batch_size, 10, "batch size."); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); namespace paddle { namespace inference { namespace analysis { TEST(Analyzer, analysis_without_tensorrt) { - FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false; + FLAGS_IA_enable_tensorrt_subgraph_engine = false; Argument argument; argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; @@ -29,13 +39,331 @@ TEST(Analyzer, analysis_without_tensorrt) { } TEST(Analyzer, analysis_with_tensorrt) { - FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true; + FLAGS_IA_enable_tensorrt_subgraph_engine = true; Argument argument; argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; analyser.Run(&argument); } +void TestWord2vecPrediction(const std::string &model_path) { + NativeConfig config; + config.model_dir = model_path; + config.use_gpu = false; + config.device = 0; + auto predictor = + ::paddle::CreatePaddlePredictor( + config); + + // One single batch + + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + // For simplicity, we set all the slots with the same data. + std::vector slots(4, tensor); + std::vector outputs; + CHECK(predictor->Run(slots, &outputs)); + + PADDLE_ENFORCE(outputs.size(), 1UL); + // Check the output buffer size and result of each tid. + PADDLE_ENFORCE(outputs.front().data.length(), 33168UL); + float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, + 0.000932706}; + const size_t num_elements = outputs.front().data.length() / sizeof(float); + // The outputs' buffers are in CPU memory. 
+ for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + LOG(INFO) << "data: " + << static_cast(outputs.front().data.data())[i]; + PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], + result[i]); + } +} + +namespace { + +struct DataRecord { + std::vector>> link_step_data_all; + std::vector> week_data_all, minute_data_all; + std::vector lod1, lod2, lod3; + std::vector> rnn_link_data, rnn_week_datas, + rnn_minute_datas; + size_t batch_iter{0}; + size_t batch_size{1}; + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= link_step_data_all.size()) { + data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter, + link_step_data_all.begin() + batch_end); + data.week_data_all.assign(week_data_all.begin() + batch_iter, + week_data_all.begin() + batch_end); + data.minute_data_all.assign(minute_data_all.begin() + batch_iter, + minute_data_all.begin() + batch_end); + // Prepare LoDs + data.lod1.push_back(0); + data.lod2.push_back(0); + data.lod3.push_back(0); + CHECK(!data.link_step_data_all.empty()) << "empty"; + CHECK(!data.week_data_all.empty()); + CHECK(!data.minute_data_all.empty()); + CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size()); + CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size()); + for (size_t j = 0; j < data.link_step_data_all.size(); j++) { + for (const auto &d : data.link_step_data_all[j]) { + data.rnn_link_data.push_back(d); + } + data.rnn_week_datas.push_back(data.week_data_all[j]); + data.rnn_minute_datas.push_back(data.minute_data_all[j]); + // calculate lod + data.lod1.push_back(data.lod1.back() + + data.link_step_data_all[j].size()); + data.lod3.push_back(data.lod3.back() + 1); + for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) { + data.lod2.push_back(data.lod2.back() + + data.link_step_data_all[j].size()); + } + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ':', &data); + std::vector> link_step_data; + std::vector link_datas; + split(data[0], '|', &link_datas); + for (auto &step_data : link_datas) { + std::vector tmp; + split_to_float(step_data, ',', &tmp); + link_step_data.push_back(tmp); + } + // load week data + std::vector week_data; + split_to_float(data[2], ',', &week_data); + // load minute data + std::vector minute_data; + split_to_float(data[1], ',', &minute_data); + link_step_data_all.push_back(std::move(link_step_data)); + week_data_all.push_back(std::move(week_data)); + minute_data_all.push_back(std::move(minute_data)); + } + } +}; +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, + week_tensor, minute_tensor; + lod_attention_tensor.name = "data_lod_attention"; + init_zero_tensor.name = "cell_init"; + lod_tensor_tensor.name = "data"; + week_tensor.name = "week"; + minute_tensor.name = "minute"; + auto one_batch = data->NextBatch(); + std::vector rnn_link_data_shape( + {static_cast(one_batch.rnn_link_data.size()), + static_cast(one_batch.rnn_link_data.front().size())}); + lod_attention_tensor.shape.assign({1, 2}); + 
lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2}); + init_zero_tensor.shape.assign({batch_size, 15}); + init_zero_tensor.lod.assign({one_batch.lod3}); + lod_tensor_tensor.shape = rnn_link_data_shape; + lod_tensor_tensor.lod.assign({one_batch.lod1}); + // clang-format off + week_tensor.shape.assign( + {static_cast(one_batch.rnn_week_datas.size()), + static_cast(one_batch.rnn_week_datas.front().size())}); + week_tensor.lod.assign({one_batch.lod3}); + minute_tensor.shape.assign( + {static_cast(one_batch.rnn_minute_datas.size()), + static_cast(one_batch.rnn_minute_datas.front().size())}); + minute_tensor.lod.assign({one_batch.lod3}); + // clang-format on + // assign data + TensorAssignData(&lod_attention_tensor, + std::vector>({{0, 0}})); + std::vector tmp_zeros(batch_size * 15, 0.); + TensorAssignData(&init_zero_tensor, {tmp_zeros}); + TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data); + TensorAssignData(&week_tensor, one_batch.rnn_week_datas); + TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas); + // Set inputs. + auto init_zero_tensor1 = init_zero_tensor; + init_zero_tensor1.name = "hidden_init"; + input_slots->assign({week_tensor, init_zero_tensor, minute_tensor, + init_zero_tensor1, lod_attention_tensor, + lod_tensor_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::FLOAT32; + } +} + +std::string DescribeTensor(const PaddleTensor &tensor) { + std::stringstream os; + os << "Tensor [" << tensor.name << "]\n"; + os << " - type: "; + switch (tensor.dtype) { + case PaddleDType::FLOAT32: + os << "float32"; + break; + case PaddleDType::INT64: + os << "int64"; + break; + default: + os << "unset"; + } + os << '\n'; + + os << " - shape: " << to_string(tensor.shape) << '\n'; + os << " - lod: "; + for (auto &l : tensor.lod) { + os << to_string(l) << "; "; + } + os << "\n"; + os << " - data: "; + + int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1, + [](int a, int b) { return a * b; }); + for (int i = 0; i < dim; i++) { + os << static_cast(tensor.data.data())[i] << " "; + } + os << '\n'; + return os.str(); +} + +} // namespace + +const float ditu_rnn_target_data[] = { + 104.711, 11.2431, 1.35422, 0, 0, 0, 0, 0, + 27.7039, 1.41486, 7.09526, 0, 0, 0, 0, 0, + 7.6481, 6.5324, 56.383, 2.88018, 8.92918, 132.007, 4.27429, 2.02934, + 14.1727, 10.7461, 25.0616, 16.0197, 14.4163, 16.9199, 6.75517, 0, + 80.0249, 4.77739, 0, 0, 0, 0, 0, 0, + 47.5643, 2.67029, 8.76252, 0, 0, 0, 0, 0, + 51.8822, 4.4411, 0, 0, 0, 0, 0, 0, + 10.7286, 12.0595, 10.6672, 0, 0, 0, 0, 0, + 93.5771, 3.84641, 0, 0, 0, 0, 0, 0, + 169.426, 0, 0, 0, 0, 0, 0, 0}; +// Test with a really complicate model. 
+void TestDituRNNPrediction(const std::string &model_path, + const std::string &data_path, int batch_size, + bool use_analysis, bool activate_ir, + int num_times = 1) { + FLAGS_IA_enable_ir = activate_ir; + FLAGS_IA_enable_tensorrt_subgraph_engine = false; + FLAGS_IA_output_storage_path = "./analysis.out"; + + std::string model_out; + if (use_analysis) { + Argument argument(model_path); + argument.model_output_store_path.reset(new std::string("./analysis.out")); + + Analyzer analyzer; + analyzer.Run(&argument); + + // Should get the transformed model stored to ./analysis.out + model_out = "./analysis.out"; + ASSERT_TRUE(PathExists(model_out)); + } else { + model_out = FLAGS_infer_ditu_rnn_model; + } + + NativeConfig config; + config.prog_file = model_out + "/__model__"; + config.param_file = model_out + "/param"; + config.use_gpu = false; + config.device = 0; + config.specify_input_name = true; + + auto predictor = + CreatePaddlePredictor(config); + std::vector input_slots; + DataRecord data(data_path, batch_size); + // Prepare inputs. + PrepareInputs(&input_slots, &data, batch_size); + std::vector outputs; + + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + predictor->Run(input_slots, &outputs); + } + LOG(INFO) << "===========profile result==========="; + LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times + << ", latency: " << timer.toc() / num_times << "ms"; + LOG(INFO) << "====================================="; + + for (auto &out : outputs) { + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + float *data = static_cast(out.data.data()); + for (size_t i = 0; + i < std::min(sizeof(ditu_rnn_target_data) / sizeof(float), size); + i++) { + EXPECT_NEAR(data[i], ditu_rnn_target_data[i], 1e-3); + } + } +} + +// Turn on the IR pass supportion, run a real inference and check the result. +TEST(Analyzer, SupportIRPass) { + FLAGS_IA_enable_ir = true; + FLAGS_IA_enable_tensorrt_subgraph_engine = false; + FLAGS_IA_output_storage_path = "./analysis.out"; + + Argument argument(FLAGS_inference_model_dir); + argument.model_output_store_path.reset(new std::string("./analysis.out")); + + Analyzer analyzer; + analyzer.Run(&argument); + + // Should get the transformed model stored to ./analysis.out + ASSERT_TRUE(PathExists("./analysis.out")); + + // Inference from this path. + TestWord2vecPrediction("./analysis.out"); +} + +// Directly infer with the original model. +TEST(Analyzer, DituRNN_without_analysis) { + TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, + FLAGS_batch_size, false, false, FLAGS_repeat); +} + +// Inference with the original model with the analysis turned on, the analysis +// module will transform the program to a data flow graph. +TEST(Analyzer, DituRNN_with_analysis) { + LOG(INFO) << "ditu rnn with analysis"; + TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, + FLAGS_batch_size, true, false, FLAGS_repeat); +} + +// Inference with analysis and IR. The IR module will fuse some large kernels. 
+TEST(Analyzer, DituRNN_with_analysis_with_IR) { + LOG(INFO) << "ditu rnn with analysis and IR fuse"; + TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, + FLAGS_batch_size, true, true, FLAGS_repeat); +} + } // namespace analysis } // namespace inference } // namespace paddle + +USE_PASS(fc_fuse_pass); +USE_PASS(graph_viz_pass); +USE_PASS(infer_clean_graph_pass); diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index 7f64bc75ae8ad40a268739cdc36051e76af9f49a..100a7504b8526b3587858dd7783913757ba09895 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -19,14 +19,16 @@ limitations under the License. */ namespace paddle { namespace inference { namespace analysis { +using ir_node_t = framework::ir::Node; +using ir_graph_t = framework::ir::Graph; // It is a better idea that the inputs and outputs of this graph is set manually // before, but there must be a Pass that helps to prune the unnecessary ops that // do not contribute to the given targets, so in this pass, analysis and get the // inputs and outputs is OK. void DataFlowGraph::Build() { - inputs.clear(); - outputs.clear(); + inputs_.clear(); + outputs_.clear(); std::unordered_set ins; std::unordered_set outs; for (auto &node : nodes.nodes()) { @@ -42,18 +44,140 @@ void DataFlowGraph::Build() { // similarly, the nodes that in outs but not in ins is the graphs' outputs for (auto *in : ins) { if (!outs.count(in)) { - inputs.push_back(in); + inputs_.push_back(in); } } for (auto *out : outs) { - if (!outs.count(out)) { - outputs.push_back(out); + if (!ins.count(out)) { + outputs_.push_back(out); } } Clean(); } +void DataFlowGraph::Build(const framework::proto::ProgramDesc &prog) { + // insert vars + // The `var2id` keeps a map from a variable's name to its Node-id, the Node-id + // will keep updating to its latest alias during the graph-building. + std::unordered_map var2id; + auto &main_block = prog.blocks(framework::kRootBlockIndex); + for (int i = 0; i < main_block.vars_size(); i++) { + const auto &var = main_block.vars(i); + auto *v = nodes.Create(Node::Type::kValue); + v->SetName(var.name()); + v->SetPbDesc(const_cast(static_cast(&var))); + v->SetPbMsg(var.SerializeAsString()); + var2id[var.name()] = v->id(); + } + + // The variables in a SSA can only write once, so if a variable is written + // multiple times(quite common in our ProgramDesc design), multiple alias + // Nodes of this variable will be created, and each will just write once. + + // An set that keep all the names of the variables(the original, not alias) + // that have been written(as outputs). Once an Op's output variable hit the + // set, it should create a new alias and update the global alias for this + // variable. And that make a Data Flow Graph a SSA. + std::unordered_set unique_written_vars; + for (int i = 0; i < main_block.ops_size(); i++) { + const auto &op = main_block.ops(i); + auto *o = nodes.Create(Node::Type::kFunction); + o->SetName(op.type()); + static_cast(o)->SetFuncType(op.type()); + // Link to the original protobuf message's memory, make it easier to + // generate from a data flow graph to fluid ProgramDesc. 
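Note that the hunk above also corrects the output-deduction test in DataFlowGraph::Build() from !outs.count(out) to !ins.count(out): a value is a graph input if it is consumed but never produced, and a graph output if it is produced but never consumed. A self-contained illustration of that set-difference rule, with plain integers standing in for Node pointers:

    // Self-contained illustration of the input/output deduction in
    // DataFlowGraph::Build() above. Plain integers stand in for Node pointers.
    #include <iostream>
    #include <unordered_set>
    #include <vector>

    int main() {
      // Tiny dataflow: value 0 -> op -> value 2, value 2 -> op -> value 3.
      std::unordered_set<int> ins{0, 2};   // values read by some op
      std::unordered_set<int> outs{2, 3};  // values written by some op

      std::vector<int> inputs, outputs;
      for (int v : ins)
        if (!outs.count(v)) inputs.push_back(v);   // never written -> graph input
      for (int v : outs)
        if (!ins.count(v)) outputs.push_back(v);   // never read -> graph output

      for (int v : inputs) std::cout << "graph input: " << v << "\n";    // 0
      for (int v : outputs) std::cout << "graph output: " << v << "\n";  // 3
    }

With the corrected condition, value 3 is reported as the only output here; the old condition tested outs against itself and would have reported none.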
+ o->SetPbDesc(const_cast(static_cast(&op))); + o->SetPbMsg(op.SerializeAsString()); + + // set inputs and outputs + for (int j = 0; j < op.inputs_size(); j++) { + auto &in_var = op.inputs(j); + for (int k = 0; k < in_var.arguments_size(); k++) { + auto *in = nodes.GetMutable(var2id.at(in_var.arguments(k))); + in->outlinks.push_back(o); + o->inlinks.push_back(in); + unique_written_vars.insert(in); + } + } + for (int j = 0; j < op.outputs_size(); j++) { + auto &out_var = op.outputs(j); + for (int k = 0; k < out_var.arguments_size(); k++) { + auto *out = nodes.GetMutable(var2id[out_var.arguments(k)]); + if (unique_written_vars.count(out)) { + // Loop found, for example, a = op(a), use SSA, change to a1 = op(a). + auto *out_alias = nodes.Create(Node::Type::kValue); + out_alias->SetName(out->name()); + out_alias->SetPbDesc(out->pb_desc()); + out_alias->SetPbMsg(out->pb_msg()); + var2id[out_alias->name()] = + out_alias->id(); // update variable's alias Node + LOG(INFO) << "loop found in graph, create SSA alias node [" + << out_alias->repr() << "] for [" << out->repr() << "]"; + out = out_alias; + } + out->inlinks.push_back(o); + o->outlinks.push_back(out); + } + } + } + // Analysis and extract the inputs and outputs of this graph. + Build(); +} + +void DataFlowGraph::Build(const framework::ir::Graph &graph) { + // Create nodes + std::unordered_map ir_node_map; + for (auto *ir_node : graph.Nodes()) { + Node *x{nullptr}; + if (ir_node->IsOp()) { + PADDLE_ENFORCE(ir_node->Op()); + VLOG(4) << "get op " << ir_node << " " << ir_node->Name(); + x = nodes.Create(Node::Type::kFunction); + x->attr("ir_node").Pointer() = ir_node; + PADDLE_ENFORCE(ir_node->Op()->Proto()); + x->SetName(ir_node->Op()->Proto()->type()); + x->SetPbMsg(ir_node->Op()->Proto()->SerializeAsString()); + } else if (ir_node->IsVar()) { + // Not create a Node for IR ControlDepVar, considering Inference currently + // just used in single thread scenerio. + VLOG(4) << "get var " << ir_node->Name(); + x = nodes.Create(Node::Type::kValue); + x->attr("ir_node").Pointer() = ir_node; + x->SetName(ir_node->Name()); + // x->SetPbMsg(ir_node->Var()->Proto()->SerializeAsString()); + } else { + PADDLE_THROW("Failed to create an Node from IR, unknown type"); + } + ir_node_map.emplace(ir_node, x); + } + VLOG(4) << "finish creating Nodes"; + + VLOG(4) << "to create edge"; + // Create links + for (auto *ir_node : graph.Nodes()) { + auto it = ir_node_map.find(ir_node); + // Skip ControlDepVar. 
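The SSA rule used while building the graph from a ProgramDesc is worth spelling out: var2id always points at the newest alias of a variable, and a fresh alias node is minted whenever a name that was already written is written again (the a = op(a) loop case logged above). A standard-library bookkeeping sketch, with integer ids standing in for analysis Nodes:

    // Bookkeeping sketch of the SSA aliasing described above: var2id maps a name to
    // its newest alias, and a new alias id is minted on every repeated write.
    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <unordered_set>

    int main() {
      std::unordered_map<std::string, int> var2id{{"a", 0}, {"b", 1}};
      std::unordered_set<int> written;  // ids already produced by some op
      int next_id = 2;

      auto write = [&](const std::string &name) {
        int id = var2id.at(name);
        if (written.count(id)) {   // name was written before: create an alias
          id = next_id++;
          var2id[name] = id;       // later readers resolve to the newest alias
          std::cout << "alias created for " << name << " -> node " << id << "\n";
        }
        written.insert(id);
      };

      write("a");  // first write keeps node 0
      write("a");  // second write creates alias node 2
      std::cout << "current node for a: " << var2id["a"] << "\n";  // prints 2
    }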
+ if (it == ir_node_map.end()) continue; + auto *node = it->second; + for (auto *x : ir_node->inputs) { + if (!ir_node_map.count(x)) continue; + node->inlinks.push_back(ir_node_map.at(x)); + } + for (auto *x : ir_node->outputs) { + if (!ir_node_map.count(x)) continue; + node->outlinks.push_back(ir_node_map.at(x)); + } + } + + Build(); + PADDLE_ENFORCE(!inputs_.empty(), + "Can't deduce any inputs from the graph, Is the graph empty?"); + + ir_graph = &graph; + VLOG(3) << "finished build from IR"; +} + void DataFlowGraph::Clean() { for (auto &node : nodes.nodes()) { std::unordered_set inlinks_set(node->inlinks.begin(), @@ -61,11 +185,9 @@ void DataFlowGraph::Clean() { std::unordered_set outlinks_set(node->outlinks.begin(), node->outlinks.end()); if (inlinks_set.size() < node->inlinks.size()) { - LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs"; node->inlinks.assign(inlinks_set.begin(), inlinks_set.end()); } if (outlinks_set.size() < node->outlinks.size()) { - LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs"; node->outlinks.assign(outlinks_set.begin(), outlinks_set.end()); } } @@ -112,10 +234,10 @@ GraphTraits::NodesBFSIterator::NodesBFSIterator( const std::vector &source) : queue_(source.begin(), source.end()) {} -// GraphTraits::NodesBFSIterator::NodesBFSIterator( -// GraphTraits::NodesBFSIterator &&other) noexcept -// : queue_(std::move(other.queue_)), -// visited_(std::move(other.visited_)) {} +GraphTraits::NodesBFSIterator::NodesBFSIterator( + GraphTraits::NodesBFSIterator &&other) noexcept + : queue_(std::move(other.queue_)), + visited_(std::move(other.visited_)) {} GraphTraits::NodesBFSIterator::NodesBFSIterator( const GraphTraits::NodesBFSIterator &other) @@ -159,7 +281,7 @@ bool GraphTraits::NodesBFSIterator::operator==( if (queue_.empty()) return other.queue_.empty(); if ((!queue_.empty()) && (!other.queue_.empty())) { return queue_.front() == other.queue_.front() && - visited_.size() == other.visited_.size(); // here need to check the + visited_.size() == other.visited_.size(); // equality of queue and // visited. Just a light but week implementation. } @@ -174,10 +296,10 @@ GraphTraits::NodesDFSIterator::NodesDFSIterator( for (auto *x : source) stack_.push(x); } -// GraphTraits::NodesDFSIterator::NodesDFSIterator( -// GraphTraits::NodesDFSIterator &&other) noexcept -// : stack_(std::move(other.stack_)), -// visited_(std::move(other.visited_)) {} +GraphTraits::NodesDFSIterator::NodesDFSIterator( + GraphTraits::NodesDFSIterator &&other) noexcept + : stack_(std::move(other.stack_)), + visited_(std::move(other.visited_)) {} GraphTraits::NodesDFSIterator::NodesDFSIterator( const GraphTraits::NodesDFSIterator &other) @@ -339,7 +461,7 @@ ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { std::vector op_nodes; - for (auto &node : GraphTraits(graph).nodes_in_TS()) { + for (auto &node : GraphTraits(*graph).nodes_in_TS()) { if (node.type() == Node::Type::kValue || node.deleted()) { continue; } diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index bb3ec6bbc1d9555386aba8837b019d2511653258..437e097acd24aad384df6712ce0de6106b3b5c65 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/inference/analysis/graph_traits.h" #include "paddle/fluid/inference/analysis/node.h" #include "paddle/fluid/platform/enforce.h" @@ -41,19 +42,43 @@ namespace analysis { */ struct DataFlowGraph { NodeMap nodes; - std::vector inputs; - std::vector outputs; + // inputs and outputs are deduced from the graph. + // Used to interact with IR. + const framework::ir::Graph *ir_graph{nullptr}; // Extract inputs and outputs of the graph. void Build(); + void Build(const framework::proto::ProgramDesc &prog); + + // Build a graph from ir::Graph. + void Build(const framework::ir::Graph &graph); + + // Get an attribute. + AnyAttr &Attr(const std::string &key) { return attrs_[key]; } + // Output a DOT graph file for debug. std::string DotString() const; std::string HumanReadableInfo(bool show_values = true, bool show_functions = true) const; + const std::vector &inputs() const { + PADDLE_ENFORCE(!inputs_.empty(), + "No inputs are deduced, need to Build() first."); + return inputs_; + } + const std::vector &outputs() const { + PADDLE_ENFORCE(!outputs_.empty(), + "No outputs are deduced, need to Build() first."); + return outputs_; + } + private: + mutable std::vector inputs_; + mutable std::vector outputs_; + std::unordered_map attrs_; + // Remove duplicate edges and so on. void Clean(); }; @@ -70,7 +95,7 @@ struct GraphTraits { : public std::iterator { NodesBFSIterator() = default; explicit NodesBFSIterator(const std::vector &source); - // NodesBFSIterator(NodesBFSIterator &&other) noexcept; + NodesBFSIterator(NodesBFSIterator &&other) noexcept; // NOTE Heavy to use. NodesBFSIterator(const NodesBFSIterator &other); @@ -93,8 +118,8 @@ struct GraphTraits { struct NodesDFSIterator : public std::iterator { NodesDFSIterator() = default; - explicit NodesDFSIterator(const std::vector &source); - // NodesDFSIterator(NodesDFSIterator &&other) noexcept; + NodesDFSIterator(const std::vector &source); + NodesDFSIterator(NodesDFSIterator &&other) noexcept; NodesDFSIterator(const NodesDFSIterator &other); Node &operator*(); @@ -116,7 +141,7 @@ struct GraphTraits { struct NodesTSIterator : public std::iterator { NodesTSIterator() = default; - explicit NodesTSIterator(const std::vector &source); + NodesTSIterator(const std::vector &source); NodesTSIterator(NodesTSIterator &&other) : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) { other.cursor_ = 0; @@ -138,7 +163,7 @@ struct GraphTraits { size_t cursor_{0}; }; - explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {} + explicit GraphTraits(const DataFlowGraph &graph) : graph_(graph) {} // default use BFS to visit the nodes. iterator_range nodes() { @@ -156,20 +181,20 @@ struct GraphTraits { private: NodesBFSIterator nodes_bfs_begin() { - return NodesBFSIterator(graph_->inputs); + return NodesBFSIterator(graph_.inputs()); } NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); } NodesDFSIterator nodes_dfs_begin() { - return NodesDFSIterator(graph_->inputs); + return NodesDFSIterator(graph_.inputs()); } NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); } - NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_->inputs); } + NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_.inputs()); } NodesTSIterator nodes_ts_end() { return NodesTSIterator(); } private: - DataFlowGraph *graph_; + const DataFlowGraph &graph_; }; // Extract the inputs and outputs of a graph. 
The inputs and outputs of a diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc index a881262665f156812da9e1576aa29b05fc398499..1682011c3d8cc9927a4b026b370671798cace625 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/analysis/ut_helper.h" namespace paddle { @@ -24,20 +25,18 @@ TEST(DataFlowGraph, BFS) { auto dfg = ProgramDescToDFG(desc); dfg.Build(); - for (auto *in : dfg.inputs) { + for (auto* in : dfg.inputs()) { LOG(INFO) << "inputs: " << in->name() << " " << static_cast(in->type()); } - for (auto *out : dfg.outputs) { + for (auto* out : dfg.outputs()) { LOG(INFO) << "outputs: " << out->name() << " " << static_cast(out->type()); } - GraphTraits trait(&dfg); - auto nodes = trait.nodes(); size_t count = 0; - for (auto it = nodes.begin(); it != nodes.end(); ++it) { - LOG(INFO) << "visiting " << it->name(); + for (auto& node : GraphTraits(dfg).nodes()) { + LOG(INFO) << "visiting " << node.name(); ++count; } ASSERT_EQ(count, dfg.nodes.size()); @@ -45,13 +44,11 @@ TEST(DataFlowGraph, BFS) { TEST(DataFlowGraph, DFS) { auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); - auto dfg = ProgramDescToDFG(desc); - dfg.Build(); - GraphTraits trait(&dfg); - auto nodes = trait.nodes_in_DFS(); + DataFlowGraph dfg; + dfg.Build(desc); size_t count = 0; - for (auto it = nodes.begin(); it != nodes.end(); ++it) { - LOG(INFO) << "visiting " << it->name(); + for (auto& node : GraphTraits(dfg).nodes_in_DFS()) { + LOG(INFO) << "visiting " << node.name(); ++count; } ASSERT_EQ(count, dfg.nodes.size()); @@ -74,21 +71,17 @@ TEST(DataFlowGraph, TS) { DataFlowGraph graph; for (int i = 0; i < 8; i++) { - auto *node = graph.nodes.Create(Node::Type::kValue); + auto* node = graph.nodes.Create(Node::Type::kValue); node->SetName("node-" + std::to_string(i)); } auto add_link = [&](int i, int j) { - Node *source = graph.nodes.GetMutable(i); - Node *target = graph.nodes.GetMutable(j); + Node* source = graph.nodes.GetMutable(i); + Node* target = graph.nodes.GetMutable(j); target->inlinks.push_back(source); source->outlinks.push_back(target); }; - graph.inputs.push_back(graph.nodes.GetMutable(0)); - graph.inputs.push_back(graph.nodes.GetMutable(1)); - graph.inputs.push_back(graph.nodes.GetMutable(2)); - add_link(0, 4); add_link(0, 5); add_link(1, 6); @@ -97,8 +90,9 @@ TEST(DataFlowGraph, TS) { add_link(4, 7); add_link(4, 3); add_link(7, 3); + graph.Build(); - auto its = GraphTraits(&graph).nodes_in_TS(); + auto its = GraphTraits(graph).nodes_in_TS(); std::vector sorted_ids; for (auto it = its.begin(); it != its.end(); ++it) { LOG(INFO) << it->name(); @@ -122,6 +116,50 @@ TEST(DataFlowGraph, TS) { assert_positive_sequence_pair(4, 7); } +TEST(DataFlowGraph, Build_ProgramDesc) { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); + DataFlowGraph graph; + graph.Build(desc); + ASSERT_EQ(graph.nodes.size(), 38UL); +} + +void SetOp(framework::ProgramDesc* prog, const std::string& type, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetInput("Xs", inputs); + op->SetOutput("Xs", outputs); 
+} + +TEST(DataFlowGraph, Build_IR_Graph) { + framework::ProgramDesc prog; + for (auto& v : std::vector({"a", "b", "c", "d", "e", "f"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(framework::proto::VarType::SELECTED_ROWS); + if (v == "c") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "OP0", std::vector({"a"}), + std::vector({"b"})); + SetOp(&prog, "OP1", std::vector({"a"}), + std::vector({"c"})); + SetOp(&prog, "mul", std::vector({"b", "c"}), + std::vector({"d"})); + SetOp(&prog, "elementwise_add", std::vector({"d", "e"}), + std::vector({"f"})); + + DataFlowGraph graph; + + framework::ir::Graph ir_graph(prog); + + graph.Build(ir_graph); + + ASSERT_EQ(graph.nodes.size(), ir_graph.Nodes().size()); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 18c32fa09199003f17183207828cdfe4e627ae1a..8c7dd146e429a7f5cd28bdd418e457e8ea5680bd 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -23,9 +23,6 @@ namespace paddle { namespace inference { -DEFINE_int32(tensorrt_max_batchsize, 3, "TensorRT maximum batch size"); -DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size"); - namespace analysis { using framework::proto::ProgramDesc; @@ -52,19 +49,15 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) { bool DataFlowGraphToFluidPass::Finalize() { return true; } void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { - FilterRedundantOutputOfSubGraph(graph); - LOG(INFO) << "graph.inputs " << graph->inputs.size(); - for (auto &node : GraphTraits(graph).nodes_in_TS()) { + // FilterRedundantOutputOfSubGraph(graph); + for (auto &node : GraphTraits(*graph).nodes_in_TS()) { if (node.deleted()) continue; switch (node.type()) { case Node::Type::kFunction: { - LOG(INFO) << "add function " << node.repr(); AddFluidOp(&node); } break; case Node::Type::kFunctionBlock: { - LOG(INFO) << "add engine op " << node.repr() << " , " - << static_cast(&node)->subgraph.size(); AddEngineOp(&node); } break; default: @@ -76,15 +69,27 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { } void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { - auto *ori_op = static_cast(node->pb_desc()); + PADDLE_ENFORCE(node); + PADDLE_ENFORCE(node->IsFunction()); + PADDLE_ENFORCE(node->pb_desc() || !node->pb_msg().empty(), + "node has invalid protobuf repr."); + // currently only the main block is analyzed. + PADDLE_ENFORCE(desc_); auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); auto *op = main_block->add_ops(); - *op = *ori_op; // copy the attributes, by default, these will not be changed - // by analysis phrase. - // The inputs and outputs of the existing ops are not changed by tensorrt - // subgraph pass. - // NOTE It might be changed by other passes in the long run. + + if (node->pb_desc()) { + auto *ori_op = static_cast(node->pb_desc()); + *op = + *ori_op; // copy the attributes, by default, these will not be changed + // by analysis phrase. + // The inputs and outputs of the existing ops are not changed by tensorrt + // subgraph pass. + // NOTE It might be changed by other passes in the long run. 
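AddFluidOp above now tolerates nodes that carry only a serialized op (pb_msg) and no live OpDesc pointer: it copies from the pointer when present and otherwise rebuilds the op, as the else branch below shows, with ParseFromString. The following sketch mimics that fallback with a toy MockOp instead of a real protobuf message, so the control flow is visible without libprotobuf.

    // Std-only sketch of the fallback in AddFluidOp: copy from the live descriptor
    // pointer when the node carries one, otherwise rebuild the op from the
    // serialized bytes kept in pb_msg. MockOp mimics the protobuf
    // SerializeAsString/ParseFromString calls without depending on libprotobuf.
    #include <iostream>
    #include <string>

    struct MockOp {
      std::string type;
      std::string SerializeAsString() const { return type; }            // toy encoding
      bool ParseFromString(const std::string &s) { type = s; return true; }
    };

    // live_desc may be null; pb_msg must then hold the serialized form.
    void AddOp(MockOp *dst, const MockOp *live_desc, const std::string &pb_msg) {
      if (live_desc != nullptr) {
        *dst = *live_desc;                         // cheap copy of the original descriptor
      } else if (!dst->ParseFromString(pb_msg)) {  // fall back to the serialized bytes
        std::cerr << "failed to parse serialized op\n";
      }
    }

    int main() {
      MockOp original{"mul"};
      MockOp a, b;
      AddOp(&a, &original, "");                          // pointer path
      AddOp(&b, nullptr, original.SerializeAsString());  // serialized path
      std::cout << a.type << " " << b.type << "\n";      // prints: mul mul
    }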
+ } else { + op->ParseFromString(node->pb_msg()); + } } void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, @@ -191,8 +196,6 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, // Set attrs SetAttr(desc.Proto(), "subgraph", block->SerializeAsString()); SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); - SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize); - SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size); SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); SetAttr(desc.Proto(), "output_name_mapping", output_mapping); node->SetPbMsg(desc.Proto()->SerializeAsString()); @@ -221,10 +224,9 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { framework::BlockDesc block_desc(nullptr, &proto); block_desc.Proto()->set_parent_idx(-1); block_desc.Proto()->set_idx(0); - LOG(INFO) << "origin variable size: " - << argument_->origin_program_desc->blocks(0).vars().size(); - LOG(INFO) << "transformed variable size: " - << block_desc.Proto()->vars().size(); + VLOG(4) << "origin variable size: " + << argument_->origin_program_desc->blocks(0).vars().size(); + VLOG(4) << "transformed variable size: " << block_desc.Proto()->vars().size(); // copy ops. for (auto *node : block_node->subgraph) { @@ -258,7 +260,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass { Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( - FLAGS_inference_analysis_graphviz_log_root, + FLAGS_IA_graphviz_log_root, "data_flow_graph_to_fluid_graphviz_debugger")); } diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h index 59c47365aa6c8ad5886c4515850d264f69cc4670..0c9a8a0b7cae17bf2eaa714348ea1c9b5e43611b 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h @@ -27,9 +27,6 @@ namespace paddle { namespace inference { -DECLARE_int32(tensorrt_max_batchsize); -DECLARE_int32(tensorrt_workspace_size); - namespace analysis { class DataFlowGraphToFluidPass final : public DataFlowGraphPass { public: diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc index c05b0e5d4690d0a447edf63a149903704bc2c9be..648b8f7d6a6ec4bafbad2838c5631e776c8699b1 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc @@ -29,7 +29,7 @@ void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) { auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png"; std::string message; - LOG(INFO) << "draw to " << png_path; + VLOG(3) << "draw to " << png_path; ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message); } diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index 511631d3e067f14bc1230d9e4b4d92dbe604e1d4..51bd0ac42d455f68ac5d70f0ce9703dfad6070d4 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -52,72 +52,7 @@ bool FluidToDataFlowGraphPass::Finalize() { return true; } void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { PADDLE_ENFORCE(graph); PADDLE_ENFORCE(desc_); - // insert vars - // The `var2id` keeps a map from a variable's name 
to its Node-id, the Node-id - // will keep updating to its latest alias during the graph-building. - std::unordered_map var2id; - auto &main_block = desc_->blocks(framework::kRootBlockIndex); - for (int i = 0; i < main_block.vars_size(); i++) { - const auto &var = main_block.vars(i); - auto *v = graph->nodes.Create(Node::Type::kValue); - v->SetName(var.name()); - v->SetPbDesc(const_cast(static_cast(&var))); - v->SetPbMsg(var.SerializeAsString()); - var2id[var.name()] = v->id(); - } - - // The variables in a SSA can only write once, so if a variable is written - // multiple times(quite common in our ProgramDesc design), multiple alias - // Nodes of this variable will be created, and each will just write once. - - // An set that keep all the names of the variables(the original, not alias) - // that have been written(as outputs). Once an Op's output variable hit the - // set, it should create a new alias and update the global alias for this - // variable. And that make a Data Flow Graph a SSA. - std::unordered_set unique_written_vars; - for (int i = 0; i < main_block.ops_size(); i++) { - const auto &op = main_block.ops(i); - auto *o = graph->nodes.Create(Node::Type::kFunction); - o->SetName(op.type()); - static_cast(o)->SetFuncType(op.type()); - // Link to the original protobuf message's memory, make it easier to - // generate from a data flow graph to fluid ProgramDesc. - o->SetPbDesc(const_cast(static_cast(&op))); - o->SetPbMsg(op.SerializeAsString()); - - // set inputs and outputs - for (int j = 0; j < op.inputs_size(); j++) { - auto &in_var = op.inputs(j); - for (int k = 0; k < in_var.arguments_size(); k++) { - auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k))); - in->outlinks.push_back(o); - o->inlinks.push_back(in); - } - } - for (int j = 0; j < op.outputs_size(); j++) { - auto &out_var = op.outputs(j); - for (int k = 0; k < out_var.arguments_size(); k++) { - auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]); - if (unique_written_vars.count(out)) { - // Loop found, for example, a = op(a), use SSA, change to a1 = op(a). - auto *out_alias = graph->nodes.Create(Node::Type::kValue); - out_alias->SetName(out->name()); - out_alias->SetPbDesc(out->pb_desc()); - out_alias->SetPbMsg(out->pb_msg()); - var2id[out_alias->name()] = - out_alias->id(); // update variable's alias Node - LOG(INFO) << "loop found in graph, create SSA alias node [" - << out_alias->repr() << "] for [" << out->repr() << "]"; - out = out_alias; - } - out->inlinks.push_back(o); - o->outlinks.push_back(out); - unique_written_vars.insert(out); - } - } - } - // Analysis and extract the inputs and outputs of this graph. 
- graph->Build(); + graph->Build(*desc_); } namespace { @@ -133,7 +68,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass { Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const { return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( - FLAGS_inference_analysis_graphviz_log_root, "fluid-to-dfg-debuger")); + FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger")); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc index d218dcd05015aa4636c16569de4addf4936c8cd5..267a0a84ebf75615e0b390f4a1b3bf3b51793fc7 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc @@ -30,7 +30,7 @@ TEST(FluidToDataFlowGraphPass, Test) { ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL); pass.Finalize(); ASSERT_FALSE(argument.main_dfg->DotString().empty()); - EXPECT_FALSE(argument.main_dfg->inputs.empty()); + EXPECT_FALSE(argument.main_dfg->inputs().empty()); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..073f49752872cbb65fddc74be75ec28d4dd0bbaf --- /dev/null +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc @@ -0,0 +1,15 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..fa3f8d313bbdd6733fa3878dd7023e125b6ced36 --- /dev/null +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h @@ -0,0 +1,82 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/inference/analysis/ir_pass_manager.h" +#include "paddle/fluid/inference/analysis/pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +class FluidToIrPass final : public DataFlowGraphPass { + public: + FluidToIrPass() = default; + + bool Initialize(Argument *argument) override { + ANALYSIS_ARGUMENT_CHECK_FIELD(argument); + if (argument->origin_program_desc) { + LOG(WARNING) << "argument's origin_program_desc is already set, might " + "duplicate called"; + } + // set fluid model program path + if (!argument->fluid_model_program_path) { + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir); + argument->fluid_model_program_path.reset( + new std::string(*argument->fluid_model_dir + "/__model__")); + } + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path); + // Load program. + auto program = LoadProgramDesc(*argument->fluid_model_program_path); + argument->origin_program_desc.reset( + new framework::proto::ProgramDesc(program)); + // Create main data flow graph. + if (!argument->main_dfg) { + argument->main_dfg.reset(new DataFlowGraph); + } + // Persist the ProgramDesc in graph's attribute. The IR graph just keep the + // address, will segfault if the original ProgramDesc destroys. + auto &ir_program_p = argument->main_dfg->Attr("ir_program_desc").Pointer(); + ir_program_p = new framework::ProgramDesc(program); + + argument_ = argument; + return true; + } + + bool Finalize() override { return true; } + + void Run(DataFlowGraph *graph) override { + // Call all the IR Passes + IRPassManager ir_passes(*static_cast( + argument_->main_dfg->Attr("ir_program_desc").Pointer())); + ir_passes.Apply(std::vector( + {// Manual update the passes here. + "graph_viz_pass", "infer_clean_graph_pass", "graph_viz_pass", + "fc_fuse_pass", "graph_viz_pass"})); + + PADDLE_ENFORCE(argument_->main_dfg.get()); + argument_->main_dfg->Build(ir_passes.graph()); + // PADDLE_ENFORCE(argument_->main_dfg->IsFullyConnected()); + } + + std::string repr() const override { return "fluid-to-ir-pass"; } + + private: + Argument *argument_{nullptr}; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..af934f261baa3807059ce6ab036545594630df58 --- /dev/null +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
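FluidToIrPass::Run above drives a fixed sequence of IR passes, interleaving graph_viz_pass so a snapshot is dumped between every transform. The standalone sketch below models that ordered pipeline with a toy Graph struct and std::function passes; the op counts and their effects are purely illustrative, not the behavior of the real fc_fuse_pass or infer_clean_graph_pass.

    // Standalone model of the ordered pass pipeline set up in FluidToIrPass::Run.
    // Graph is a toy stand-in for framework::ir::Graph.
    #include <functional>
    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    struct Graph { int num_ops = 5; };
    using Pass = std::function<void(Graph *)>;

    int main() {
      auto viz = [](Graph *g) { std::cout << "  viz snapshot: " << g->num_ops << " ops\n"; };
      std::vector<std::pair<std::string, Pass>> pipeline = {
          {"graph_viz_pass", viz},
          {"infer_clean_graph_pass", [](Graph *g) { g->num_ops -= 1; }},  // e.g. drop helper ops
          {"graph_viz_pass", viz},
          {"fc_fuse_pass", [](Graph *g) { g->num_ops -= 2; }},            // e.g. mul+add -> fc
          {"graph_viz_pass", viz},
      };
      Graph g;
      for (auto &stage : pipeline) {
        std::cout << "Running IR pass [" << stage.first << "]\n";
        stage.second(&g);
      }
    }

Interleaving the viz step is what makes the per-stage dot files named in IRPassManager::Apply below (ir_origin.dot, then ir_<previous pass>.dot) useful for comparing the graph before and after each transform.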
+ +#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" + +#include +#include "paddle/fluid/inference/analysis/ut_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(FluidToIrPass, Test) { + FluidToIrPass pass; + Argument argument(FLAGS_inference_model_dir); + pass.Initialize(&argument); + pass.Run(argument.main_dfg.get()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle + +USE_PASS(fc_fuse_pass); +USE_PASS(graph_viz_pass); +USE_PASS(infer_clean_graph_pass); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index a0f912b251d5ea29594a7f601d5b2bce91201790..5151e2b69ac199dea136535ba445e890596f6227 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include @@ -151,6 +152,23 @@ static framework::proto::ProgramDesc LoadProgramDesc( return program_desc; } +static bool FileExists(const std::string &filepath) { + std::ifstream file(filepath); + bool exists = file.is_open(); + file.close(); + return exists; +} + +static bool PathExists(const std::string &path) { + struct stat statbuf; + if (stat(path.c_str(), &statbuf) != -1) { + if (S_ISDIR(statbuf.st_mode)) { + return true; + } + } + return false; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..d849b637bcf3fe3944ad11680bbe041e19a71e24 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/ir_pass_manager.h" +#include + +namespace paddle { +namespace inference { +namespace analysis { + +IRPassManager::IRPassManager(const ProgramDesc& program) { + graph_.reset(new framework::ir::Graph(program)); +} + +void IRPassManager::Apply(const std::vector& passes) { + graph_->Set("graph_viz_path", new std::string("./1.dot")); + // Apply all the passes + std::string pre_pass; + for (const std::string& pass_name : passes) { + LOG(WARNING) << "Running IR pass [" << pass_name << "]"; + auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); + if (pass_name == "graph_viz_pass") { + std::string dot_file_path = + "ir_" + (pre_pass.empty() ? 
"origin" : pre_pass) + ".dot"; + pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); + } + graph_ = pass->Apply(std::move(graph_)); + pre_pass = pass_name; + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..3338e37ecf1c591a631fd829a05b07e562af703e --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -0,0 +1,46 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines IRPassManager, it helps control the passes in IR. Inference + * phrase will load the model program and parameters from disk, that is quite + * different from the training phase. + * This manager will control the Passes and make the passes in IR work smoothly + * for inference. + */ + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace inference { +namespace analysis { +using framework::ProgramDesc; + +class IRPassManager final { + public: + IRPassManager(const ProgramDesc& program); + + void Apply(const std::vector& passes); + + framework::ir::Graph& graph() const { return *graph_; } + + private: + std::unique_ptr graph_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc index 1c429176424bd5c1d8fa5e015c19d698f966880e..c313db08875669010ddcca13aa66b383ee6d26f8 100644 --- a/paddle/fluid/inference/analysis/model_store_pass.cc +++ b/paddle/fluid/inference/analysis/model_store_pass.cc @@ -35,19 +35,21 @@ void ModelStorePass::Run(DataFlowGraph *x) { std::stringstream ss; // NOTE these commands only works on linux. 
ss << "mkdir -p " << *argument_->model_output_store_path; - LOG(INFO) << "run command: " << ss.str(); + VLOG(3) << "run command: " << ss.str(); PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); ss.str(""); ss << "cp " << *argument_->fluid_model_dir << "/*" << " " << *argument_->model_output_store_path; - LOG(INFO) << "run command: " << ss.str(); + VLOG(3) << "run command: " << ss.str(); PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); // Store program PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc, "program desc is not transformed, should call " "DataFlowGraphToFluidPass first."); + VLOG(3) << "store analyzed program to " + << *argument_->model_output_store_path; const std::string program_output_path = *argument_->model_output_store_path + "/__model__"; std::ofstream file(program_output_path, std::ios::binary); @@ -58,6 +60,8 @@ void ModelStorePass::Run(DataFlowGraph *x) { file.write(serialized_message.c_str(), serialized_message.size()); } +bool ModelStorePass::Finalize() { return true; } + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/model_store_pass.h index fac7083925776b6209d49255c9e67b930cb1250b..3a2869e30bd80cfd0756f8e21acb414656620eaa 100644 --- a/paddle/fluid/inference/analysis/model_store_pass.h +++ b/paddle/fluid/inference/analysis/model_store_pass.h @@ -44,6 +44,8 @@ class ModelStorePass : public DataFlowGraphPass { model in the disk, and that model can be reloaded for prediction again.)DD"; } + bool Finalize() override; + private: Argument* argument_{nullptr}; }; diff --git a/paddle/fluid/inference/analysis/model_store_pass_tester.cc b/paddle/fluid/inference/analysis/model_store_pass_tester.cc index 5f3526dd504e77e58d79b4f675db86a22fd0f26b..d6493fc25edf25003504542f1b01c4105754c8df 100644 --- a/paddle/fluid/inference/analysis/model_store_pass_tester.cc +++ b/paddle/fluid/inference/analysis/model_store_pass_tester.cc @@ -30,7 +30,7 @@ TEST(DFG_StorePass, test) { argument.model_output_store_path.reset( new std::string("./_dfg_store_pass_tmp")); // disable storage in alalyzer - FLAGS_inference_analysis_output_storage_path = ""; + FLAGS_IA_output_storage_path = ""; analyzer.Run(&argument); ModelStorePass pass; diff --git a/paddle/fluid/inference/analysis/node.cc b/paddle/fluid/inference/analysis/node.cc index f2e918f3ff41d9db0c3ec38561015967bed26f4e..3339b5044df0cf91d00aa9ddad310d4bf263bc3c 100644 --- a/paddle/fluid/inference/analysis/node.cc +++ b/paddle/fluid/inference/analysis/node.cc @@ -20,17 +20,6 @@ namespace paddle { namespace inference { namespace analysis { -template <> -std::string &NodeAttr::As() { - if (data_.empty()) { - type_index_ = std::type_index(typeid(std::string)); - } - PADDLE_ENFORCE_EQ(type_index_, std::type_index(typeid(std::string))); - return data_; -} - -std::string &NodeAttr::String() { return As(); } - std::vector Value::dot_attrs() const { return std::vector({Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"), diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h index 47e524bc5c4a6b1324d5f182053129311487522d..af34156bc2f101465d87cb10e2155745022eb521 100644 --- a/paddle/fluid/inference/analysis/node.h +++ b/paddle/fluid/inference/analysis/node.h @@ -29,6 +29,7 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/analysis/device.h" #include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/platform/variant.h" namespace paddle { namespace inference { @@ -37,41 +38,36 @@ namespace analysis { class NodeMap; // A helper class to maintain the status from Pass. -struct NodeAttr { +struct AnyAttr { + using any_t = + boost::variant; // NOTE T should be a primary type or a struct combined by several primary // types. // NOTE the STL containers should not use here. // Some usages // Attr attr; // attr.Bool() = true; - bool &Bool() { return As(); } float &Float() { return As(); } int32_t &Int32() { return As(); } int64_t &Int64() { return As(); } void *&Pointer() { return As(); } - std::string &String(); + std::string &String() { return As(); } - private: template T &As() { - // init storage in the first usage. - if (data_.empty()) { - VLOG(4) << "resize data to " << sizeof(T); - type_index_ = std::type_index(typeid(T)); - data_.resize(sizeof(T)); + if (type_index_ == typeid(AnyAttr)) { + type_index_ = typeid(T); + any_data_ = T(); + } else { + PADDLE_ENFORCE(type_index_ == typeid(T), "fetch error type"); } - PADDLE_ENFORCE(framework::IsType(type_index_), - "type not matched, origin is %s, want %s", - DataTypeNamer::Global().repr(type_index_), - DataTypeNamer::Global().repr()); - PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error"); - return *reinterpret_cast(&data_[0]); + return boost::get(any_data_); } private: - std::string data_; - std::type_index type_index_{typeid(NodeAttr)}; + any_t any_data_; + std::type_index type_index_{typeid(AnyAttr)}; }; /* @@ -108,7 +104,7 @@ class Node { // Get an additional attribute and convert it to T data type. NOTE this will // silently create a new attribute if not exists. - NodeAttr &attr(const std::string &name) const { return attrs_[name]; } + AnyAttr &attr(const std::string &name) const { return attrs_[name]; } int id() const { return id_; } @@ -153,7 +149,7 @@ class Node { Type type_{Type::kNone}; // Mark this node is deleted by some pass. bool deleted_{false}; - mutable std::unordered_map attrs_; + mutable std::unordered_map attrs_; }; class Function; diff --git a/paddle/fluid/inference/analysis/node_tester.cc b/paddle/fluid/inference/analysis/node_tester.cc index ea832a3a7e47758be9b6bd59a4325ddb576ec446..9207c15373fb4264ff0e738e93ae88e1c08b554c 100644 --- a/paddle/fluid/inference/analysis/node_tester.cc +++ b/paddle/fluid/inference/analysis/node_tester.cc @@ -20,6 +20,24 @@ namespace paddle { namespace inference { namespace analysis { +TEST(NodeAttr, bool) { + AnyAttr x; + x.Bool() = true; + ASSERT_EQ(x.Bool(), true); +} + +TEST(NodeAttr, int32) { + AnyAttr x; + x.Int32() = 32; + ASSERT_EQ(x.Int32(), 32); +} + +TEST(NodeAttr, string) { + AnyAttr x; + x.String() = "Hello"; + ASSERT_EQ(x.String(), "Hello"); +} + TEST(Node, Attr) { // Node is an abstract class, use Value instead for they share the same Attr // logic. 
@@ -27,6 +45,9 @@ TEST(Node, Attr) { auto* node = nodes.Create(Node::Type::kValue); node->attr("v0").Int32() = 2008; ASSERT_EQ(node->attr("v0").Int32(), 2008); + + node->attr("str").String() = "hello world"; + ASSERT_EQ(node->attr("str").String(), "hello world"); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h index 6806f9ff7dada2c1e2328e1ffbfd225afefcf474..7719c6f5ff3c940948c7bdbcb25513cdf430281b 100644 --- a/paddle/fluid/inference/analysis/pass.h +++ b/paddle/fluid/inference/analysis/pass.h @@ -63,7 +63,7 @@ class Pass { // Human-readable short representation. virtual std::string repr() const = 0; // Human-readable long description. - virtual std::string description() const = 0; + virtual std::string description() const { return "No DOC"; } }; // NodePass process on any Node types. diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc index b428bb22b1f0c5c1a47fc4c46c9070c1ace4a228..cfdca33882ea00a28e3ea51ca5fd77ec9605bf3a 100644 --- a/paddle/fluid/inference/analysis/pass_manager.cc +++ b/paddle/fluid/inference/analysis/pass_manager.cc @@ -22,7 +22,7 @@ namespace analysis { bool PassManager::Initialize(Argument* argument) { argument_ = argument; for (auto& pass : data_) { - LOG(INFO) << "Initializing pass " << pass->repr(); + LOG(WARNING) << "Initializing pass [" << pass->repr() << "]"; if (!pass->Initialize(argument)) { LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; return false; @@ -33,8 +33,9 @@ bool PassManager::Initialize(Argument* argument) { void DfgPassManager::RunAll() { PADDLE_ENFORCE(argument_); + LOG(INFO) << "Total " << data_.size() << " passes"; for (auto& pass : data_) { - VLOG(4) << "Running pass [" << pass->repr() << "]"; + LOG(WARNING) << "Running pass [" << pass->repr() << "]"; pass->Run(argument_->main_dfg.get()); } } @@ -42,8 +43,7 @@ void DfgPassManager::RunAll() { void NodePassManager::RunAll() { PADDLE_ENFORCE(argument_); PADDLE_ENFORCE(argument_->main_dfg.get()); - auto trait = - GraphTraits(argument_->main_dfg.get()).nodes_in_DFS(); + auto trait = GraphTraits(*argument_->main_dfg).nodes_in_DFS(); for (auto& node : trait) { for (auto& pass : data_) { pass->Run(&node); diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index 80809d4c43ca08298bad25cf614dcb4117d3f99a..670a8de667494c655bed15aa3e4ce8265448635a 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -34,7 +34,7 @@ inline void MarkOutLinksInSubGraph(const Function *func) { } void SubGraphSplitter::MarkNodesInsideSubGraph() { - for (auto &node : GraphTraits(graph_).nodes()) { + for (auto &node : GraphTraits(*graph_).nodes()) { if (node_inside_subgraph_teller_(&node)) { node.attr(kMarkerAttrName).Bool() = true; if (node.type() == Node::Type::kFunction) { @@ -76,7 +76,7 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) { std::vector> SubGraphSplitter::ExtractSubGraphs() { std::vector marked_nodes; - for (auto &node : GraphTraits(graph_).nodes_in_TS()) { + for (auto &node : GraphTraits(*graph_).nodes_in_TS()) { if (node.attr(kMarkerAttrName).Bool()) { marked_nodes.push_back(&node); } @@ -153,6 +153,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() { inlink_or_outlink_cleaner(o->inlinks); } } + FilterRedundantOutputOfSubGraph(graph_); } } // namespace analysis diff --git 
a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc index f736e385c11add152dc9ab9485bf1de40f80b2f3..9f51fafe0b2a66f9d062a6b751fe7a3bc662ce7c 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc @@ -69,8 +69,8 @@ class DfgDebuggerPass : public DFG_GraphvizDrawPass { }; Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const { - DFG_GraphvizDrawPass::Config config( - FLAGS_inference_analysis_graphviz_log_root, "tensorrt_marked_node"); + DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root, + "tensorrt_marked_node"); return new DfgDebuggerPass(config); } bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; } diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index a72e27d651d0591815a9d93354d2aea8aa216de6..0ca1af455ca10fa6995ad3a1c33825108a3fd7ad 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -18,7 +18,10 @@ if(APPLE) endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api) +set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager + graph_viz_pass fc_fuse_pass + infer_clean_graph_pass + ) if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) @@ -62,17 +65,20 @@ endif() if (WITH_ANAKIN AND WITH_GPU) # only needed in CI # compile the libinference_anakin_api.a and anakin.so. - nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) - #nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin) + cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml) + cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) function(anakin_target target_name) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endfunction() anakin_target(inference_anakin_api) - #anakin_target(inference_anakin_api_shared) + anakin_target(inference_anakin_api_shared) if (WITH_TESTING) - cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc + cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin - DEPS inference_anakin_api dynload_cuda SERIAL) - target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + DEPS inference_anakin_api_shared dynload_cuda SERIAL) + cc_test(api_anakin_engine_rnn_tester SRCS api_anakin_engine_rnn_tester.cc + ARGS --model=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin + --datapath=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn_data.txt + DEPS inference_anakin_api_shared dynload_cuda SERIAL) endif(WITH_TESTING) endif() diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 63c3f0d7b3f5c2b9246e2b041796caf5eb562826..5f1e1b548c7b7daa66932571d7053701bc0bd1f6 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc index 6b374ceefbc180a5c22abe591f12e1c3d89bc64a..ea66aa89b87ba3c25cdcd5eb2c5155a481ef7987 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.cc +++ b/paddle/fluid/inference/api/api_anakin_engine.cc @@ -13,9 +13,22 @@ // limitations under the License. #include "paddle/fluid/inference/api/api_anakin_engine.h" + +#ifdef PADDLE_WITH_CUDA #include +#endif + +#include +#include +#include +#include +#include #include +#include "framework/core/net/net.h" +#include "framework/operators/ops.h" +#include "saber/funcs/timer.h" + namespace paddle { template @@ -23,16 +36,24 @@ PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( const AnakinConfig &config) { CHECK(Init(config)); } - +template <> +PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( + const AnakinConfig &config) { + omp_set_dynamic(0); + omp_set_num_threads(1); + mkl_set_num_threads(1); + CHECK(Init(config)); +} template bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { if (!(graph_.load(config.model_file))) { - LOG(FATAL) << "fail to load graph from " << config.model_file; + VLOG(3) << "fail to load graph from " << config.model_file; return false; } auto inputs = graph_.get_ins(); for (auto &input_str : inputs) { graph_.ResetBatchSize(input_str, config.max_batch_size); + max_batch_size_ = config.max_batch_size; } // optimization for graph if (!(graph_.Optimize())) { @@ -52,15 +73,15 @@ bool PaddleInferenceAnakinPredictor::Run( std::vector *output_data, int batch_size) { for (const auto &input : inputs) { if (input.dtype != PaddleDType::FLOAT32) { - LOG(ERROR) << "Only support float type inputs. " << input.name - << "'s type is not float"; + VLOG(3) << "Only support float type inputs. 
" << input.name + << "'s type is not float"; return false; } auto d_tensor_in_p = executor_p_->get_in(input.name); - auto net_shape = d_tensor_in_p->valid_shape(); + auto net_shape = d_tensor_in_p->shape(); if (net_shape.size() != input.shape.size()) { - LOG(ERROR) << " input " << input.name - << "'s shape size should be equal to that of net"; + VLOG(3) << " input " << input.name + << "'s shape size should be equal to that of net"; return false; } int sum = 1; @@ -79,21 +100,45 @@ bool PaddleInferenceAnakinPredictor::Run( } d_tensor_in_p->reshape(tmp_shape); + if (input.lod.size() > 0) { + if (input.lod.size() > 1) { + VLOG(3) << " input lod first dim should <=1, but you set " + << input.lod.size(); + return false; + } + std::vector offset(input.lod[0].begin(), input.lod[0].end()); + d_tensor_in_p->set_seq_offset(offset); + VLOG(3) << "offset.size(): " << offset.size(); + for (int i = 0; i < offset.size(); i++) { + VLOG(3) << offset[i]; + } + } + float *d_data_p = d_tensor_in_p->mutable_data(); - if (cudaMemcpy(d_data_p, static_cast(input.data.data()), - d_tensor_in_p->valid_size() * sizeof(float), - cudaMemcpyHostToDevice) != 0) { - LOG(ERROR) << "copy data from CPU to GPU error"; - return false; + +#ifdef PADDLE_WITH_CUDA + if (std::is_same::value) { + if (cudaMemcpy(d_data_p, static_cast(input.data.data()), + d_tensor_in_p->valid_size() * sizeof(float), + cudaMemcpyHostToDevice) != 0) { + VLOG(3) << "copy data from CPU to GPU error"; + return false; + } + } +#endif + if (std::is_same::value) { + memcpy(d_data_p, static_cast(input.data.data()), + d_tensor_in_p->valid_size() * sizeof(float)); } - cudaStreamSynchronize(NULL); } +#ifdef PADDLE_WITH_CUDA cudaDeviceSynchronize(); executor_p_->prediction(); cudaDeviceSynchronize(); +#endif if (output_data->empty()) { - LOG(ERROR) << "At least one output should be set with tensors' names."; + VLOG(3) << "At least one output should be set with tensors' names."; return false; } for (auto &output : *output_data) { @@ -102,14 +147,22 @@ bool PaddleInferenceAnakinPredictor::Run( if (output.data.length() < tensor->valid_size() * sizeof(float)) { output.data.Resize(tensor->valid_size() * sizeof(float)); } - // Copy data from GPU -> CPU - if (cudaMemcpy(output.data.data(), tensor->mutable_data(), - tensor->valid_size() * sizeof(float), - cudaMemcpyDeviceToHost) != 0) { - LOG(ERROR) << "copy data from GPU to CPU error"; - return false; + +#if PADDLE_WITH_CUDA + if (std::is_same::value) { + // Copy data from GPU -> CPU + if (cudaMemcpy(output.data.data(), tensor->mutable_data(), + tensor->valid_size() * sizeof(float), + cudaMemcpyDeviceToHost) != 0) { + VLOG(3) << "copy data from GPU to CPU error"; + return false; + } + } +#endif + if (std::is_same::value) { + memcpy(output.data.data(), tensor->mutable_data(), + tensor->valid_size() * sizeof(float)); } - cudaStreamSynchronize(NULL); } return true; } @@ -132,7 +185,7 @@ PaddleInferenceAnakinPredictor::Clone() { auto anakin_predictor_p = dynamic_cast *>(cls.get()); if (!anakin_predictor_p) { - LOG(ERROR) << "fail to call Init"; + VLOG(3) << "fail to call Init"; return nullptr; } anakin_predictor_p->get_executer().init(graph_); @@ -162,6 +215,44 @@ std::unique_ptr CreatePaddlePredictor< VLOG(3) << "Anakin Predictor create on unknown platform."; return nullptr; } -}; +} + +#ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER +template +using executor_t = + anakin::Net; + +template +void DisplayOpTimer(executor_t *net_executor, int epoch) { + std::vector op_time = net_executor->get_op_time(); + auto exec_funcs = 
net_executor->get_exec_funcs(); + auto op_param = net_executor->get_op_param(); + for (int i = 0; i < op_time.size(); i++) { + LOG(INFO) << "name: " << exec_funcs[i].name + << " op_type: " << exec_funcs[i].op_name + << " op_param: " << op_param[i] << " time " << op_time[i] / epoch; + } + std::map op_map; + for (int i = 0; i < op_time.size(); i++) { + auto it = op_map.find(op_param[i]); + if (it != op_map.end()) + op_map[op_param[i]] += op_time[i]; + else + op_map.insert(std::pair(op_param[i], op_time[i])); + } + for (auto it = op_map.begin(); it != op_map.end(); ++it) { + LOG(INFO) << it->first << " " << (it->second) / epoch << " ms"; + } +} +#endif + +template +PaddleInferenceAnakinPredictor::~PaddleInferenceAnakinPredictor() { +#ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER + DisplayOpTimer(executor_p_, max_batch_size_); +#endif + delete executor_p_; + executor_p_ = nullptr; +} } // namespace paddle diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index 836badd9799228c6c294dcad5df73d039d36a1ff..dd08661880d8cc3a9f4401e9af91a3d10e6579b6 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -47,10 +47,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { anakin::Net& get_executer(); - ~PaddleInferenceAnakinPredictor() override { - delete executor_p_; - executor_p_ = nullptr; - }; + ~PaddleInferenceAnakinPredictor() override; private: bool Init(const AnakinConfig& config); @@ -60,6 +57,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { anakin::Net* executor_p_{nullptr}; AnakinConfig config_; + int max_batch_size_{0}; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc b/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..6183864234e85b89e94821890d9606b082c59233 --- /dev/null +++ b/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc @@ -0,0 +1,315 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include "framework/core/net/net.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +DEFINE_string(model, "", "Directory of the inference model."); +DEFINE_string(datapath, "", "Path of the dataset."); +DEFINE_int32(batch_size, 1, "batch size."); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); + +// Timer for timer +class Timer { + public: + double start; + double startu; + void tic() { + struct timeval tp; + gettimeofday(&tp, NULL); + start = tp.tv_sec; + startu = tp.tv_usec; + } + double toc() { + struct timeval tp; + gettimeofday(&tp, NULL); + double used_time_ms = + (tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0; + return used_time_ms; + } +}; + +std::vector string_split(std::string in_str, + std::string delimiter) { + std::vector seq; + int found = in_str.find(delimiter); + int pre_found = -1; + while (found != std::string::npos) { + if (pre_found == -1) { + seq.push_back(in_str.substr(0, found)); + } else { + seq.push_back(in_str.substr(pre_found + delimiter.length(), + found - delimiter.length() - pre_found)); + } + pre_found = found; + found = in_str.find(delimiter, pre_found + delimiter.length()); + } + seq.push_back( + in_str.substr(pre_found + 1, in_str.length() - (pre_found + 1))); + return seq; +} +std::vector string_split( + std::string in_str, std::vector& delimiter) { // NOLINT + std::vector in; + std::vector out; + out.push_back(in_str); + for (auto del : delimiter) { + in = out; + out.clear(); + for (auto s : in) { + auto out_s = string_split(s, del); + for (auto o : out_s) { + out.push_back(o); + } + } + } + return out; +} + +class Data { + public: + Data(std::string file_name, int batch_size) + : _batch_size(batch_size), _total_length(0) { + _file.open(file_name); + _file.seekg(_file.end); + _total_length = _file.tellg(); + _file.seekg(_file.beg); + } + void get_batch_data(std::vector>& fea, // NOLINT + std::vector>& week_fea, // NOLINT + std::vector>& time_fea, // NOLINT + std::vector& seq_offset); // NOLINT + + private: + std::fstream _file; + int _total_length; + int _batch_size; +}; + +void Data::get_batch_data( + std::vector>& fea, // NOLINT + std::vector>& week_fea, // NOLINT + std::vector>& time_fea, // NOLINT + std::vector& seq_offset) { // NOLINT + int seq_num = 0; + long unsigned int cum = 0; // NOLINT + + char buf[10000]; + seq_offset.clear(); + seq_offset.push_back(0); + fea.clear(); + week_fea.clear(); + time_fea.clear(); + while (_file.getline(buf, 10000)) { + std::string s = buf; + std::vector deli_vec = {":"}; + std::vector data_vec = string_split(s, deli_vec); + + std::vector seq; + seq = string_split(data_vec[0], {"|"}); + + for (auto link : seq) { + std::vector data = string_split(link, ","); + std::vector vec; + for (int i = 0; i < data.size(); i++) { + vec.push_back(atof(data[i].c_str())); + } + fea.push_back(vec); + } + std::vector week_data; + std::vector time_data; + + week_data = string_split(data_vec[2], ","); + std::vector vec_w; + for (int i = 0; i < week_data.size(); i++) { + vec_w.push_back(atof(week_data[i].c_str())); + } + week_fea.push_back(vec_w); + + time_data = string_split(data_vec[1], ","); + std::vector vec_t; + for (int i = 0; i < time_data.size(); i++) { + vec_t.push_back(atof(time_data[i].c_str())); + } + time_fea.push_back(vec_t); + + cum += seq.size(); + seq_offset.push_back(cum); + + seq_num++; + if (seq_num >= _batch_size) { + break; + } + } +} + +namespace paddle { + 
+AnakinConfig GetConfig() { + AnakinConfig config; + // using AnakinConfig::X86 if you need to use cpu to do inference + config.target_type = AnakinConfig::X86; + config.model_file = FLAGS_model; + config.device = 0; + config.max_batch_size = 1000; // the max number of token + return config; +} + +void set_tensor(std::string name, std::vector shape, + std::vector& vec) { // NOLINT + int sum = 1; + std::for_each(shape.begin(), shape.end(), [&](int n) { sum *= n; }); + float* data = new float[sum]; + PaddleTensor tensor; + tensor.name = name; + tensor.shape = shape; + tensor.data = PaddleBuf(data, sum); + tensor.dtype = PaddleDType::FLOAT32; + vec.push_back(tensor); +} + +void single_test() { + AnakinConfig config = GetConfig(); + auto predictor = + CreatePaddlePredictor(config); + + int max_batch_size = 1000; + std::string feature_file = FLAGS_datapath; + Data map_data(feature_file, FLAGS_batch_size); + std::vector> fea; + std::vector> week_fea; + std::vector> time_fea; + std::vector seq_offset; // NOLINT + + paddle::PaddleTensor tensor_0, tensor_1, tensor_2; + tensor_0.name = "input_0"; + tensor_1.name = "input_4"; + tensor_2.name = "input_5"; + + PaddleTensor tensor_out; + tensor_out.name = "final_output.tmp_1_gout"; + tensor_out.shape = std::vector({}); + tensor_out.data = PaddleBuf(); + tensor_out.dtype = PaddleDType::FLOAT32; + + std::vector inputs; + std::vector outputs(1, tensor_out); + + int data_0_dim = 38; + int data_1_dim = 10; + int data_2_dim = 10; + float data_0[max_batch_size * data_0_dim]; // NOLINT + float data_1[max_batch_size * data_1_dim]; // NOLINT + float data_2[max_batch_size * data_2_dim]; // NOLINT + + int count = 0; + while (true) { + if (count++ > 0) break; // only run the first batch in ci. + seq_offset.clear(); + map_data.get_batch_data(fea, week_fea, time_fea, seq_offset); + if (seq_offset.size() <= 1) { + LOG(FATAL) << "seq_offset.size() <= 1, exit."; + break; + } + + std::vector> seq_offset_vec; // NOLINT + seq_offset_vec.push_back(seq_offset); + tensor_0.lod = seq_offset_vec; + + int p_shape_0[] = {(int)fea.size(), 1, 1, data_0_dim}; // NOLINT + int p_shape_1[] = {(int)week_fea.size(), data_1_dim, 1, 1}; // NOLINT + int p_shape_2[] = {(int)time_fea.size(), data_2_dim, 1, 1}; // NOLINT + + std::vector shape_0(p_shape_0, p_shape_0 + 4); + std::vector shape_1(p_shape_1, p_shape_1 + 4); + std::vector shape_2(p_shape_2, p_shape_2 + 4); + + tensor_0.shape = shape_0; + tensor_1.shape = shape_1; + tensor_2.shape = shape_2; + + for (int i = 0; i < fea.size(); i++) { + memcpy(data_0 + i * data_0_dim, &fea[i][0], sizeof(float) * data_0_dim); + } + for (int i = 0; i < week_fea.size(); i++) { + memcpy(data_1 + i * data_1_dim, &week_fea[i][0], + sizeof(float) * data_1_dim); + } + for (int i = 0; i < time_fea.size(); i++) { + memcpy(data_2 + i * data_2_dim, &time_fea[i][0], + sizeof(float) * data_2_dim); + } + + tensor_0.data = + paddle::PaddleBuf(data_0, fea.size() * sizeof(float) * data_0_dim); + tensor_1.data = + paddle::PaddleBuf(data_1, week_fea.size() * sizeof(float) * data_1_dim); + tensor_2.data = + paddle::PaddleBuf(data_2, time_fea.size() * sizeof(float) * data_2_dim); + + tensor_0.dtype = paddle::PaddleDType::FLOAT32; + tensor_1.dtype = paddle::PaddleDType::FLOAT32; + tensor_2.dtype = paddle::PaddleDType::FLOAT32; + + inputs.clear(); + inputs.push_back(tensor_1); + inputs.push_back(tensor_2); + inputs.push_back(tensor_0); + + Timer timer; + timer.tic(); + for (int i = 0; i < FLAGS_repeat; i++) predictor->Run(inputs, &outputs); + + LOG(INFO) << "batch_size = " << 
FLAGS_batch_size + << ", repeat = " << FLAGS_repeat + << ", sequence_length = " << seq_offset[seq_offset.size() - 1] + << ", latency: " << timer.toc() / FLAGS_repeat << "ms"; + + float* data_o = static_cast(outputs[0].data.data()); + VLOG(3) << "outputs[0].data.length() = " << outputs[0].data.length(); + for (size_t j = 0; j < outputs[0].data.length(); ++j) { + VLOG(3) << "output[" << j << "]: " << data_o[j]; + } + } +} +} // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + logger::init(argv[0]); + + paddle::single_test(); + /* multi-threads + std::vector threads; + int num = 1; + for (int i = 0; i < num; i++) { + LOG(INFO) << " thread id : " << i; + threads.emplace_back(paddle::single_test); + } + for (int i = 0; i < num; i++) { + threads[i].join(); + } + threads.clear(); + */ + + return 0; +} diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index e31c637e969f7a86f4f185abb4f0f01d3303db75..32a691b81ffc0586a07f4f06d2114fa5da2e18e2 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -137,8 +137,11 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, return false; } for (size_t i = 0; i < feed_target_names_.size(); ++i) { - VLOG(4) << "setting " << i << "-th target"; - feed_targets[feed_target_names_[i]] = &feeds[i]; + if (config_.specify_input_name) { + feed_targets[inputs[i].name] = &feeds[i]; + } else { + feed_targets[feed_target_names_[i]] = &feeds[i]; + } } // get fetch variable std::map fetch_targets; diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index 45b5a7638b7dc6a54bbd905766fd5c284cb6aea1..9ac037297167fe7de29925ffe36f4d39efb65313 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/operators/tensorrt_engine_op.h" @@ -32,7 +33,8 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { bool Init(const std::shared_ptr& parent_scope) { VLOG(3) << "Predictor::init()"; - + FLAGS_tensorrt_max_batch_size = config_.max_batch_size; + FLAGS_tensorrt_workspace_size = config_.workspace_size; if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); } else { @@ -150,3 +152,12 @@ CreatePaddlePredictor( } } // namespace paddle + +USE_TRT_CONVERTER(elementwise_add_weight); +USE_TRT_CONVERTER(mul); +USE_TRT_CONVERTER(conv2d); +USE_TRT_CONVERTER(relu); +USE_TRT_CONVERTER(fc); +USE_TRT_CONVERTER(pool2d); +USE_TRT_CONVERTER(softmax); +USE_TRT_CONVERTER(batch_norm); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc index fcbf9b89d608e7961e3ef81ac1c70e083dae1cc0..8f1a72316d6c146ebc9a86ced739ef088a3b4267 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc @@ -23,7 +23,7 @@ namespace paddle { DEFINE_string(dirname, "", "Directory of the inference model."); void CompareTensorRTWithFluid(bool enable_tensorrt) { - 
FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = enable_tensorrt; + FLAGS_IA_enable_tensorrt_subgraph_engine = enable_tensorrt; //# 1. Create PaddlePredictor with a config. NativeConfig config0; diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 3e829dd726b132844a45427b7b0b39eedf197496..7824ef2649af81a2390ff3bc537eb7c93c70e402 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -13,16 +13,22 @@ else use_gpu_list='false' fi +PREFIX=inference-vis-demos%2F +URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX} + # download vis_demo data function download() { dir_name=$1 mkdir -p $dir_name cd $dir_name - wget -q ${URL_ROOT}$dir_name.tar.gz - tar xzf *.tar.gz + if [[ -e "${PREFIX}${dir_name}.tar.gz" ]]; then + echo "${PREFIX}{dir_name}.tar.gz has been downloaded." + else + wget -q ${URL_ROOT}$dir_name.tar.gz + tar xzf *.tar.gz + fi cd .. } -URL_ROOT=http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F mkdir -p data cd data vis_demo_list='se_resnext50 ocr mobilenet' diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h new file mode 100644 index 0000000000000000000000000000000000000000..2c166cc0622f68e6d527005795c21236ccf43c33 --- /dev/null +++ b/paddle/fluid/inference/api/helper.h @@ -0,0 +1,110 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { +namespace inference { + +// Timer for timer +class Timer { + public: + double start; + double startu; + void tic() { + struct timeval tp; + gettimeofday(&tp, NULL); + start = tp.tv_sec; + startu = tp.tv_usec; + } + double toc() { + struct timeval tp; + gettimeofday(&tp, NULL); + double used_time_ms = + (tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0; + return used_time_ms; + } +}; + +void split(const std::string &str, char sep, std::vector *pieces) { + pieces->clear(); + if (str.empty()) { + return; + } + size_t pos = 0; + size_t next = str.find(sep, pos); + while (next != std::string::npos) { + pieces->push_back(str.substr(pos, next - pos)); + pos = next + 1; + next = str.find(sep, pos); + } + if (!str.substr(pos).empty()) { + pieces->push_back(str.substr(pos)); + } +} +void split_to_float(const std::string &str, char sep, std::vector *fs) { + std::vector pieces; + split(str, sep, &pieces); + std::transform(pieces.begin(), pieces.end(), std::back_inserter(*fs), + [](const std::string &v) { return std::stof(v); }); +} +template +std::string to_string(const std::vector &vec) { + std::stringstream ss; + for (const auto &c : vec) { + ss << c << " "; + } + return ss.str(); +} +template <> +std::string to_string>( + const std::vector> &vec) { + std::stringstream ss; + for (const auto &piece : vec) { + ss << to_string(piece) << "\n"; + } + return ss.str(); +} +template <> +std::string to_string>>( + const std::vector>> &vec) { + std::stringstream ss; + for (const auto &line : vec) { + for (const auto &rcd : line) { + ss << to_string(rcd) << ";\t"; + } + ss << '\n'; + } + return ss.str(); +} +// clang-format off +void TensorAssignData(PaddleTensor *tensor, const std::vector> &data) { + // Assign buffer + int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1, [](int a, int b) { return a * b; }); + tensor->data.Resize(sizeof(float) * dim); + int c = 0; + for (const auto &f : data) { + for (float v : f) { static_cast(tensor->data.data())[c++] = v; } + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/api/high_level_api_cn.md b/paddle/fluid/inference/api/high_level_api_cn.md index 2fb914592cbcb1b0c3f2ef33ff9cf4c295e427b6..442c598978c700f4c438b365b8900db5b65bc5ec 100644 --- a/paddle/fluid/inference/api/high_level_api_cn.md +++ b/paddle/fluid/inference/api/high_level_api_cn.md @@ -65,13 +65,13 @@ config.model_dir = "xxx"; config.use_gpu = false; // 创建一个原生的 PaddlePredictor auto predictor = - paddle::CreatePaddlePredictor(config); + paddle::CreatePaddlePredictor(config); // 创建输入 tensor int64_t data[4] = {1, 2, 3, 4}; paddle::PaddleTensor tensor{.name = "", .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + .data = paddle::PaddleBuf(data, sizeof(data)), + .dtype = paddle::PaddleDType::INT64}; // 创建输出 tensor,输出 tensor 的内存可以复用 std::vector outputs; // 执行预测 diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 794534467be066e91db2b4c204913ab2cf12dbfd..36fd0727aa7beef4a06a5f2e63ec0c43583ddf84 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -45,7 +45,7 @@ class PaddleBuf { PaddleBuf(void* data, size_t length) : data_(data), length_(length), memory_owned_{false} {} // Own memory. 
- explicit PaddleBuf(size_t length) + PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} // Resize to `length` bytes. void Resize(size_t length); @@ -70,7 +70,7 @@ struct PaddleTensor { std::vector shape; PaddleBuf data; // blob of data. PaddleDType dtype; - std::vector> lod; // lod data + std::vector> lod; // Tensor+LoD equals LoDTensor }; enum class PaddleEngineKind { @@ -120,6 +120,8 @@ struct NativeConfig : public PaddlePredictor::Config { bool use_gpu{false}; int device{0}; float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization. + // Specify the variable's name of each input. + bool specify_input_name{false}; std::string prog_file; std::string param_file; @@ -137,6 +139,14 @@ struct AnakinConfig : public PaddlePredictor::Config { struct TensorRTConfig : public NativeConfig { // Determine whether a subgraph will be executed by TRT. int min_subgraph_size{1}; + // While TensorRT allows an engine optimized for a given max batch size + // to run at any smaller size, the performance for those smaller + // sizes may not be as well-optimized. Therefore, Max batch is best + // equivalent to the runtime batch size. + int max_batch_size{1}; + // For workspace_size, refer it from here: + // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting + int workspace_size{1 << 30}; }; // A factory to help create different predictors. diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index b52d083f280e5e7713600a7b748dedd37aca0a1e..a610687a5b11999a7cb7426dbe961e5972ee1746 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,4 @@ -nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto) +nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) add_subdirectory(convert) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 6863b035d8cd9dfb21aed3947226a796778912a4..2a449eb95e3537a11962912a6a3f29e89958fbd8 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,7 +1,7 @@ # Add TRT tests nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc -activation_op.cc softmax_op.cc +batch_norm_op.cc activation_op.cc softmax_op.cc DEPS tensorrt_engine operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS @@ -24,3 +24,6 @@ nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL) + +nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..94f8b0ae5606d39a722ffe28501645c9b6fc5d2e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -0,0 +1,136 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class BatchNormOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + LOG(INFO) << "convert a fluid batch norm op to tensorrt batch_norm"; + + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1); // Bias is a weight + PADDLE_ENFORCE_EQ(op_desc.Input("Mean").size(), 1); // Mean is a weight + PADDLE_ENFORCE_EQ(op_desc.Input("Scale").size(), 1); // Scale is a weight + PADDLE_ENFORCE_EQ(op_desc.Input("Variance").size(), + 1); // Variance is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + // Declare weights + auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); + auto* Mean_v = scope.FindVar(op_desc.Input("Mean").front()); + auto* Scale_v = scope.FindVar(op_desc.Input("Scale").front()); + auto* Variance_v = scope.FindVar(op_desc.Input("Variance").front()); + const float eps = boost::get(op_desc.GetAttr("epsilon")); + + PADDLE_ENFORCE_NOT_NULL(Bias_v); + PADDLE_ENFORCE_NOT_NULL(Mean_v); + PADDLE_ENFORCE_NOT_NULL(Scale_v); + PADDLE_ENFORCE_NOT_NULL(Variance_v); + + // get tensor + auto* Bias_t = Bias_v->GetMutable(); + auto* Mean_t = Mean_v->GetMutable(); + auto* Scale_t = Scale_v->GetMutable(); + auto* Variance_t = Variance_v->GetMutable(); + + // create temp tensor for weights + framework::LoDTensor bias_tensor; + framework::LoDTensor mean_tensor; + framework::LoDTensor scale_tensor; + framework::LoDTensor variance_tensor; + + bias_tensor.Resize(Bias_t->dims()); + mean_tensor.Resize(Mean_t->dims()); + scale_tensor.Resize(Scale_t->dims()); + variance_tensor.Resize(Variance_t->dims()); + + platform::CPUPlace cpu_place; + // copy data from gpu to cpu + TensorCopySync((*Bias_t), cpu_place, &bias_tensor); + TensorCopySync((*Mean_t), cpu_place, &mean_tensor); + TensorCopySync((*Scale_t), cpu_place, &scale_tensor); + TensorCopySync((*Variance_t), cpu_place, &variance_tensor); + + auto* bias_data = bias_tensor.mutable_data(platform::CPUPlace()); + auto* mean_data = mean_tensor.mutable_data(platform::CPUPlace()); + auto* scale_data = scale_tensor.mutable_data(platform::CPUPlace()); + auto* variance_data = + variance_tensor.mutable_data(platform::CPUPlace()); + + std::unique_ptr combile_scale_tensor( + new framework::LoDTensor()); + std::unique_ptr combile_bias_tensor( + new framework::LoDTensor()); + + combile_scale_tensor->Resize(scale_tensor.dims()); + combile_bias_tensor->Resize(bias_tensor.dims()); + + auto* combile_scale_data = + combile_scale_tensor->mutable_data(platform::CPUPlace()); + auto* combile_bias_data = + combile_bias_tensor->mutable_data(platform::CPUPlace()); + + size_t ele_num = combile_scale_tensor->memory_size() 
/ sizeof(float); + + for (size_t i = 0; i < ele_num; i++) { + float scale = scale_data[i]; + float bias = bias_data[i]; + float mean = mean_data[i]; + float variance = variance_data[i]; + combile_scale_data[i] = scale / sqrtf(variance + eps); + combile_bias_data[i] = bias - mean * combile_scale_data[i]; + } + + TensorRTEngine::Weight scale_weights{ + nvinfer1::DataType::kFLOAT, static_cast(combile_scale_data), + combile_scale_tensor->memory_size() / sizeof(float)}; + TensorRTEngine::Weight shift_weights{ + nvinfer1::DataType::kFLOAT, static_cast(combile_bias_data), + combile_bias_tensor->memory_size() / sizeof(float)}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + + nvinfer1::IScaleLayer* layer = + TRT_ENGINE_ADD_LAYER(engine_, Scale, *const_cast(X), + nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(), + scale_weights.get(), power_weights.get()); + + auto output_name = op_desc.Output("Y").front(); + engine_->weight_map[op_desc.Input("Bias").front()] = + std::move(combile_bias_tensor); + engine_->weight_map[op_desc.Input("Scale").front()] = + std::move(combile_scale_tensor); + + engine_->SetITensor(output_name, layer->getOutput(0)); + + if (test_mode) { + engine_->DeclareOutput(output_name); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(batch_norm, BatchNormOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index dba1d50b2d1c487ced8e6ca51f2d257641ad5fc7..841a95db38ce7cf0cb5961ff04cb569ee2633e6f 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -35,12 +35,20 @@ class Conv2dOpConverter : public OpConverter { auto* Y_v = scope.FindVar(op_desc.Input("Filter").front()); PADDLE_ENFORCE_NOT_NULL(Y_v); auto* Y_t = Y_v->GetMutable(); - auto* weight_data = Y_t->mutable_data(platform::CPUPlace()); - PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL); - const int n_output = Y_t->dims()[0]; - const int filter_h = Y_t->dims()[2]; - const int filter_w = Y_t->dims()[3]; + platform::CPUPlace cpu_place; + std::unique_ptr weight_tensor( + new framework::LoDTensor()); + weight_tensor->Resize(Y_t->dims()); + TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); + + auto* weight_data = + weight_tensor->mutable_data(platform::CPUPlace()); + + PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); + const int n_output = weight_tensor->dims()[0]; + const int filter_h = weight_tensor->dims()[2]; + const int filter_w = weight_tensor->dims()[3]; const int groups = boost::get(op_desc.GetAttr("groups")); const std::vector dilations = @@ -57,7 +65,7 @@ class Conv2dOpConverter : public OpConverter { TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, static_cast(weight_data), - Y_t->memory_size() / sizeof(float)}; + weight_tensor->memory_size() / sizeof(float)}; TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; auto* layer = TRT_ENGINE_ADD_LAYER( @@ -70,6 +78,8 @@ class Conv2dOpConverter : public OpConverter { layer->setNbGroups(groups); auto output_name = op_desc.Output("Output").front(); + engine_->weight_map[op_desc.Input("Filter").front()] = + std::move(weight_tensor); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { engine_->DeclareOutput(output_name); diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 
3744550f60a1696aedd8a3ecd24f1b21d22325b9..60a72b4eb5c75b5cd12305f13763a9a1a567213f 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { @@ -40,10 +39,17 @@ class ElementwiseWeightOpConverter : public OpConverter { auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); PADDLE_ENFORCE_NOT_NULL(Y_v); auto* Y_t = Y_v->GetMutable(); - auto* weight_data = Y_t->mutable_data(platform::CPUPlace()); + + platform::CPUPlace cpu_place; + std::unique_ptr weight_tensor( + new framework::LoDTensor()); + weight_tensor->Resize(Y_t->dims()); + TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); + auto* weight_data = + weight_tensor->mutable_data(platform::CPUPlace()); auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; - std::vector dims_y = framework::vectorize2int(Y_t->dims()); + std::vector dims_y = framework::vectorize2int(weight_tensor->dims()); if (static_cast(dims_y.size()) == dims_x.nbDims + 1) { if (dims_y[0] == 1) dims_y.erase(dims_y.begin()); } @@ -70,9 +76,9 @@ class ElementwiseWeightOpConverter : public OpConverter { PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!"); } - TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, - static_cast(weight_data), - Y_t->memory_size() / sizeof(float)}; + TensorRTEngine::Weight shift_weights{ + nvinfer1::DataType::kFLOAT, static_cast(weight_data), + weight_tensor->memory_size() / sizeof(float)}; TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, @@ -82,6 +88,8 @@ class ElementwiseWeightOpConverter : public OpConverter { engine_, Scale, *const_cast(X), scale_mode, shift_weights.get(), scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Out")[0]; + + engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 39fe1f609d7b94638506877fc301f19ef33ec8ac..ad98d85aae9cf594922aca00c43718ccfbce2278 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -12,12 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include "paddle/fluid/inference/tensorrt/engine.h" -#include "paddle/fluid/platform/place.h" namespace paddle { namespace inference { @@ -73,19 +68,26 @@ class FcOpConverter : public OpConverter { auto* Y_t = Y_v->GetMutable(); // This may trigger a GPU->CPU copy, because TRT's weight can only be // assigned from CPU memory, that can't be avoided. 
- auto* weight_data = Y_t->mutable_data(platform::CPUPlace()); - PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL); // a matrix - size_t n_output = Y_t->dims()[1]; + platform::CPUPlace cpu_place; + framework::LoDTensor weight_tensor; + weight_tensor.Resize(Y_t->dims()); + TensorCopySync((*Y_t), cpu_place, &weight_tensor); - framework::LoDTensor tmp; - tmp.Resize(Y_t->dims()); - memcpy(tmp.mutable_data(platform::CPUPlace()), weight_data, + auto* weight_data = weight_tensor.mutable_data(platform::CPUPlace()); + + PADDLE_ENFORCE_EQ(weight_tensor.dims().size(), 2UL); // a matrix + size_t n_output = weight_tensor.dims()[1]; + + std::unique_ptr tmp(new framework::LoDTensor()); + tmp->Resize(weight_tensor.dims()); + + memcpy(tmp->mutable_data(platform::CPUPlace()), weight_data, Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float)); TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, static_cast(weight_data), Y_t->memory_size() / sizeof(float)}; TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT, - static_cast(tmp.data()), + static_cast(tmp->data()), Y_t->memory_size() / sizeof(float)); weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]}); tmp_weight.dims = weight.dims; @@ -106,6 +108,7 @@ class FcOpConverter : public OpConverter { auto output_name = op_desc.Output("Out").front(); engine_->SetITensor(output_name, layer->getOutput(0)); + engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp); if (test_mode) { engine_->DeclareOutput(output_name); } diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 11cad95361867476c6f775af778015da37f1cfb1..73f1b28ddf73403862e55d102a259d7b6cf67b1f 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -33,6 +33,7 @@ class Pool2dOpConverter : public OpConverter { PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + bool global_pooling = boost::get(op_desc.GetAttr("global_pooling")); std::string pool_type = boost::get(op_desc.GetAttr("pooling_type")); std::vector ksize = @@ -42,7 +43,13 @@ class Pool2dOpConverter : public OpConverter { std::vector paddings = boost::get>(op_desc.GetAttr("paddings")); - const nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); + nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); + if (global_pooling == true) { + nvinfer1::Dims input_shape = input1->getDimensions(); + int nbDims = input_shape.nbDims; + nv_ksize.d[0] = input_shape.d[nbDims - 2]; + nv_ksize.d[1] = input_shape.d[nbDims - 1]; + } const nvinfer1::DimsHW nv_strides(strides[0], strides[1]); const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); diff --git a/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..41412cb079540da72760558379b158b6538aa6a8 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(batch_norm_op, test) { + std::unordered_set parameters( + {"batch_norm_scale", "batch_norm_bias", "batch_norm_mean", + "batch_norm_variance"}); + framework::Scope scope; + TRTConvertValidation validator(5, parameters, scope, 1 << 15); + std::vector param_shape{2}; + + validator.DeclInputVar("batch_norm_X", nvinfer1::DimsCHW(2, 5, 5)); + validator.DeclParamVar("batch_norm_scale", param_shape); + validator.DeclParamVar("batch_norm_bias", param_shape); + validator.DeclParamVar("batch_norm_mean", param_shape); + validator.DeclParamVar("batch_norm_variance", param_shape); + validator.DeclOutputVar("batch_norm_Y", nvinfer1::DimsCHW(2, 5, 5)); + validator.DeclOutputVar("batch_norm_save_mean", param_shape); + validator.DeclOutputVar("batch_norm_save_variance", param_shape); + + // Prepare Op description + framework::OpDesc desc; + + desc.SetType("batch_norm"); + desc.SetInput("X", {"batch_norm_X"}); + desc.SetInput("Scale", {"batch_norm_scale"}); + desc.SetInput("Bias", {"batch_norm_bias"}); + desc.SetInput("Mean", {"batch_norm_mean"}); + desc.SetInput("Variance", {"batch_norm_variance"}); + desc.SetOutput("Y", {"batch_norm_Y"}); + desc.SetOutput("MeanOut", {"batch_norm_mean"}); + desc.SetOutput("VarianceOut", {"batch_norm_variance"}); + desc.SetOutput("SavedMean", {"batch_norm_save_mean"}); + desc.SetOutput("SavedVariance", {"batch_norm_save_variance"}); + + float eps = 1e-5f; + bool is_test = true; + desc.SetAttr("epsilon", eps); + desc.SetAttr("is_test", is_test); + + validator.SetOp(*desc.Proto()); + + std::unordered_set neglected_output = { + "batch_norm_save_mean", "batch_norm_save_variance", "batch_norm_mean", + "batch_norm_variance"}; + validator.Execute(3, neglected_output); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle +USE_OP(batch_norm); diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index d6651a5b244ba31a01220e6299cb2016ae61fe64..01d7f700da9cc67d0ebbd3d9649e3823f58a8811 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -57,6 +57,7 @@ TEST(OpConverter, ConvertBlock) { auto* x = scope.Var("conv2d-Y"); auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); + x_tensor->mutable_data(platform::CUDAPlace(0)); OpConverter converter; converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope, diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc index c5dddbc8cd37b9fb1ba39382af2da5ad045f3af2..aedd6b62df040eeee4e48f628128511cd8bf4439 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -20,7 +20,7 @@ namespace paddle { namespace inference { namespace tensorrt { -TEST(Pool2dOpConverter, 
main) { +void test_pool2d(bool global_pooling) { framework::Scope scope; std::unordered_set parameters; TRTConvertValidation validator(5, parameters, scope, 1 << 15); @@ -28,7 +28,10 @@ TEST(Pool2dOpConverter, main) { // The ITensor's Dims should not contain the batch size. // So, the ITensor's Dims of input and output should be C * H * W. validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4)); - validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2)); + if (global_pooling) + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 1, 1)); + else + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2)); // Prepare Op description framework::OpDesc desc; @@ -45,6 +48,7 @@ TEST(Pool2dOpConverter, main) { desc.SetAttr("ksize", ksize); desc.SetAttr("strides", strides); desc.SetAttr("paddings", paddings); + desc.SetAttr("global_pooling", global_pooling); LOG(INFO) << "set OP"; validator.SetOp(*desc.Proto()); @@ -53,6 +57,10 @@ TEST(Pool2dOpConverter, main) { validator.Execute(3); } +TEST(Pool2dOpConverter, normal) { test_pool2d(false); } + +TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true); } + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 4265f33f28fe36b1745baf4761c3c85e3a281d6b..0a6f171fc40a838fd81d6a51aca0430d5526f188 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" @@ -48,11 +49,17 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place, auto dims = tensor->dims(); size_t num_elements = analysis::AccuDims(dims, dims.size()); PADDLE_ENFORCE_GT(num_elements, 0); - auto* data = tensor->mutable_data(place); + + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(dims); + auto* temp_data = temp_tensor.mutable_data(cpu_place); for (size_t i = 0; i < num_elements; i++) { - *(data + i) = random(0., 1.); + *(temp_data + i) = random(0., 1.); } + + TensorCopySync(temp_tensor, place, tensor); } /* @@ -91,18 +98,26 @@ class TRTConvertValidation { engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims); } + void DeclParamVar(const std::string& name, const std::vector dim_vec) { + DeclVar(name, dim_vec); + } + // Declare a parameter varaible in the scope. 
void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) { DeclVar(name, dims, true); } + void DeclOutputVar(const std::string& name, const std::vector dim_vec) { + DeclVar(name, dim_vec); + } + void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) { DeclVar(name, dims); } void DeclVar(const std::string& name, const std::vector dim_vec) { - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); auto* x = scope_.Var(name); auto* x_tensor = x->GetMutable(); @@ -141,18 +156,22 @@ class TRTConvertValidation { PADDLE_ENFORCE(var); auto tensor = var->GetMutable(); - engine_->SetInputFromCPU( + engine_->SetInputFromGPU( input, static_cast(tensor->data()), sizeof(float) * analysis::AccuDims(tensor->dims(), tensor->dims().size())); } } - void Execute(int batch_size) { + // We use the set 'neglected_output' here, because some Ops like batch norm, + // the outputs specified in the op des are only used during training, + // so we should neglect those output during inference. + void Execute(int batch_size, + std::unordered_set neglected_output = {}) { // Execute Fluid Op PADDLE_ENFORCE_LE(batch_size, max_batch_size_); - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); op_->Run(scope_, place); // Execute TRT. engine_->Execute(batch_size); @@ -161,6 +180,7 @@ class TRTConvertValidation { ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); const size_t output_space_size = 3000; for (const auto& output : op_desc_->OutputArgumentNames()) { + if (neglected_output.count(output)) continue; std::vector fluid_out; std::vector trt_out(output_space_size); engine_->GetOutputInCPU(output, &trt_out[0], output_space_size); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index b821c3d0bf425c46fae634fbf53f7ee63100ca5c..14e9e14d33d637ee68e37593cc48721e5169499f 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -33,6 +33,7 @@ void TensorRTEngine::Build(const DescType &paddle_model) { } void TensorRTEngine::Execute(int batch_size) { + freshDeviceId(); batch_size_ = batch_size; std::vector buffers; for (auto &buf : buffers_) { @@ -60,6 +61,7 @@ TensorRTEngine::~TensorRTEngine() { } void TensorRTEngine::FreezeNetwork() { + freshDeviceId(); PADDLE_ENFORCE(infer_builder_ != nullptr, "Call InitNetwork first to initialize network."); PADDLE_ENFORCE(infer_network_ != nullptr, @@ -241,6 +243,13 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } +void TensorRTEngine::freshDeviceId() { + int count; + cudaGetDeviceCount(&count); + PADDLE_ENFORCE_LT(device_, count); + cudaSetDevice(device_); +} + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 694468c419c20089de1cdecff1a903ad0cc6e99f..bd3ba4cea6551a7f6651e311e2649de191a6faa1 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include #include #include +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -52,13 +53,15 @@ class TensorRTEngine : public EngineBase { }; TensorRTEngine(int max_batch, int max_workspace, - cudaStream_t* stream = nullptr, + cudaStream_t* stream = nullptr, int device = 0, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), stream_(stream ? stream : &default_stream_), - logger_(logger) { - cudaStreamCreate(&default_stream_); + logger_(logger), + device_(device) { + freshDeviceId(); + cudaStreamCreate(stream_); } virtual ~TensorRTEngine(); @@ -119,6 +122,15 @@ class TensorRTEngine : public EngineBase { nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } void SetRuntimeBatch(size_t batch_size); int GetRuntimeBatch(); + int GetDevice() { return device_; } + + // A pointer to CPU memory is needed of the TRT weight. + // Before TRT runs, fluid loads weight into GPU storage. + // so we need to copy the weights from GPU to CPU in our op converter. + // We use a map to store these weights for the weight memory is not released + // in advance, which affecting the construction of TRT Op. + std::unordered_map> + weight_map; private: // the max batch size @@ -140,6 +152,8 @@ class TensorRTEngine : public EngineBase { std::unordered_map buffer_sizes_; std::unordered_map itensor_map_; + // The specific GPU id that the TensorRTEngine bounded to. + int device_; // TensorRT related internal members template @@ -156,6 +170,10 @@ class TensorRTEngine : public EngineBase { infer_ptr infer_network_; infer_ptr infer_engine_; infer_ptr infer_context_; + // Each ICudaEngine object is bound to a specific GPU when it is instantiated, + // ensure that the thread is associated with the correct device by calling + // freshDeviceId(). + void freshDeviceId(); }; // class TensorRTEngine // Add an layer__ into engine__ with args ARGS. @@ -188,8 +206,8 @@ class TRT_EngineManager { // Create or get an engine called `name` TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream, - const std::string& name) { - auto* p = new TensorRTEngine(max_batch, max_workspace, stream); + const std::string& name, int gpu_device = 0) { + auto* p = new TensorRTEngine(max_batch, max_workspace, stream, gpu_device); engines_[name].reset(p); return p; } diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index dc03702990587bf5e65d28da662d10df4d882110..da1f6535cb3b2476cd475797861d6d2bb6d88856 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -27,7 +27,7 @@ namespace tensorrt { class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { - ASSERT_EQ(0, cudaStreamCreate(&stream_)); + // ASSERT_EQ(0, cudaStreamCreate(&stream_)); engine_ = new TensorRTEngine(10, 1 << 10, &stream_); engine_->InitNetwork(); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index e8b5dec9d49f5613cec92441d19ab7dc1a1ad90c..68fbde2c09fd9a9e84fd7f1202fe474beb0e81b9 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -9,7 +9,6 @@ function(op_library TARGET) # op_library is a function to create op library. The interface is same as # cc_library. 
But it handle split GPU/CPU code and link some common library # for ops. - set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE) set(cc_srcs) set(cu_srcs) set(hip_cu_srcs) @@ -84,6 +83,16 @@ function(op_library TARGET) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") endif() + #remove windows unsupported op + if (WIN32) + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") + if ("${TARGET}" STREQUAL "${windows_unsupport_op}") + return() + endif() + endforeach() + endif(WIN32) + set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE) + list(LENGTH op_library_DEPS op_library_DEPS_len) if (${op_library_DEPS_len} GREATER 0) set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) @@ -100,7 +109,8 @@ function(op_library TARGET) endif() # Define operators that don't need pybind here. - foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op") + foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" +"tensor_array_read_write_op" "tensorrt_engine_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() @@ -180,19 +190,19 @@ function(op_library TARGET) endfunction() add_subdirectory(math) +if (NOT WIN32) add_subdirectory(nccl) - if(WITH_GPU) op_library(nccl_op DEPS nccl_common) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") else() set(DEPS_OPS ${DEPS_OPS} nccl_op) endif() +endif() # NOT WIN32 set(DISTRIBUTE_DEPS "") if(WITH_DISTRIBUTE) add_subdirectory(distributed) - set(DISTRIBUTE_DEPS "") if(WITH_GRPC) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) @@ -221,7 +231,7 @@ if(WITH_DISTRIBUTE) #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op # listen_and_serv_op sum_op executor SERIAL) - if(WITH_GPU) + if(WITH_GPU AND NOT WIN32) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op ${DISTRIBUTE_DEPS} executor SERIAL) if(WITH_GRPC) @@ -232,7 +242,7 @@ if(WITH_DISTRIBUTE) set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op) - endif() + endif() # WITH_GPU AND NOT WIN32 else() set(DEPS_OPS ${DEPS_OPS} checkpoint_notify_op prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op) endif() @@ -248,6 +258,7 @@ op_library(softmax_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax) if (WITH_GPU AND TENSORRT_FOUND) op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n") nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc DEPS tensorrt_engine_op analysis) @@ -329,5 +340,7 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) +if(NOT WIN32) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) +endif() nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu 
index d3a7ceed466a9b5e4d773f1531d198adff97eac2..27487b396ccf63d962defa6b270063ccb409164e 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -26,8 +26,6 @@ namespace plat = paddle::platform; act_type##_grad, ops::ActivationGradKernel>, \ ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>); + ops::grad_functor>); FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 48f3b5a5bc06fbc211895a1a6d1521cfd97e0086..912415192659dc004f54a76e9cd1a20581d512a6 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -333,7 +333,8 @@ struct SqrtGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = static_cast(0.5) * dout / out; + const Out out_conj = Eigen::numext::conj(out); + dx.device(d) = static_cast(0.5) * dout / out_conj; } }; @@ -739,7 +740,7 @@ struct PowGradFunctor : public BaseActivationFunctor { typename dX> void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = dout * static_cast(factor) * - x.pow(static_cast(factor) - static_cast(1)); + x.pow(static_cast(factor - static_cast(1))); } }; @@ -862,11 +863,10 @@ struct SwishGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - T b = static_cast(beta); auto temp1 = static_cast(1) / - (static_cast(1) + (static_cast(-b) * x).exp()); - auto temp2 = temp1 * (static_cast(1) - (b * out)); - dx.device(d) = dout * ((b * out) + temp2); + (static_cast(1) + (static_cast(-beta) * x).exp()); + auto temp2 = temp1 * (static_cast(1) - (beta * out)); + dx.device(d) = dout * ((beta * out) + temp2); } }; diff --git a/paddle/fluid/operators/assign_value_op.cu.cc b/paddle/fluid/operators/assign_value_op.cu.cc index 0ff174b3884df63d54d6486b017cc1a15ab23103..08bfde5dc92de9c675e5b9b85f8e65a3bab8631c 100644 --- a/paddle/fluid/operators/assign_value_op.cu.cc +++ b/paddle/fluid/operators/assign_value_op.cu.cc @@ -13,10 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/assign_value_op.h" -#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; -namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel, - ops::AssignValueKernel, - ops::AssignValueKernel); + ops::AssignValueKernel); diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1cb65346ee2b755b48f8dd8f1456a32861c3a0b6 --- /dev/null +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -0,0 +1,422 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/attention_lstm_op.h" +#include +#include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { + +void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(C0) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"), + "Input(LSTMWeight) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LSTMBias"), + "Input(LSTMBias) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"), + "Input(AttentionWeight) of AttentionLSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(Hidden) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Output(Cell) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"), + "Output(AttentionedX) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"), + "Output(AttentionFCOut) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("LSTMX"), + "Output(LSTMX) of AttentionLSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"), + "Output(LSTMOUT) of AttentionLSTM should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + const int M = x_dims[1]; + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + + auto w_dims = ctx->GetInputDim("LSTMWeight"); + const int D = w_dims[1] / 4; + PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(LSTMWeight)'s rank must be 2."); + PADDLE_ENFORCE_EQ(w_dims[0], D + M, + "LSTMWeight dims should be (%d + %d) * %d.", D + M, 4 * D); + + auto b_dims = ctx->GetInputDim("LSTMBias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "Input(LSTMBias)'s rank must be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, "LSTMBias dims should be 1 x %d.", 4 * D); + PADDLE_ENFORCE_EQ(b_dims[1], 4 * D, "LSTMBias dims should be 1 x %d.", 4 * D); + + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2."); + PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D); + if (ctx->HasInput("H0")) { + auto h_dims = ctx->GetInputDim("H0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + } + + auto atten_w_dims = ctx->GetInputDim("AttentionWeight"); + PADDLE_ENFORCE_EQ(atten_w_dims.size(), 2, + "Input(AttentionWeight)'s rank must be 2."); + PADDLE_ENFORCE_EQ(atten_w_dims[0], M + D, + "AttentionWeight shapes must be (%d + %d) * 1.", M, D); + PADDLE_ENFORCE_EQ(atten_w_dims[1], 1, + "AttentionWeight shapes must be (%d + %d) * 1.", M, D); + if (ctx->HasInput("AttentionBias")) { + auto atten_b_dims = ctx->GetInputDim("AttentionBias"); + PADDLE_ENFORCE_EQ(atten_b_dims.size(), 2, + "Input(AttentionBias)'s rank must be 2."); + PADDLE_ENFORCE_EQ(atten_b_dims[0], 1, + "AttentionBias shapes must be 1 * 1."); + PADDLE_ENFORCE_EQ(atten_b_dims[1], 1, + "AttentionBias shapes must be 1 * 1."); + } + + if (ctx->HasInput("AttentionScalar")) { + auto dims = ctx->GetInputDim("AttentionScalar"); + PADDLE_ENFORCE_EQ(dims.size(), 2, + "Input(AttentionScalar)'s rank must be 2."); + PADDLE_ENFORCE_EQ(dims[0], 1, "AttentionScalar shapes must be 1 * 1."); + 
PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalar shapes must be 1 * 1."); + } + + if (ctx->HasInput("AttentionScalarBias")) { + auto dims = ctx->GetInputDim("AttentionScalarBias"); + PADDLE_ENFORCE( + ctx->HasInput("AttentionScalar"), + "AttentionScalar should not be null when have AttentionScalarBias."); + PADDLE_ENFORCE_EQ(dims.size(), 2, + "Input(AttentionScalarBias)'s rank must be 2."); + PADDLE_ENFORCE_EQ(dims[0], 1, "AttentionScalarBias shapes must be 1 * 1."); + PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalarBias shapes must be 1 * 1."); + } + + framework::DDim out_dims({x_dims[0], D}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->SetOutputDim("AttentionedX", {x_dims[0], 1}); + ctx->SetOutputDim("LSTMX", {1, M}); + ctx->SetOutputDim("LSTMOUT", {1, 4 * D}); + // AttentionFCOut should be reshape as (maxseqlen,1) in runtime + ctx->ShareLoD("X", "Hidden"); + ctx->ShareLoD("X", "Cell"); +} + +framework::OpKernelType AttentionLSTMOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); +} + +void AttentionLSTMOpMaker::Make() { + AddInput("X", + "(LoDTensor) the input is a LodTensor, which support " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X M), where T is the " + "total time steps in this mini-batch, M is the dim size of x."); + AddInput("C0", + "(Tensor) LSTM C0" + "This is a tensor with shape (N x D), where N is the batch size, D " + "is the gate size." + "C0 is necessary because of attention."); + AddInput("H0", + "(Tensor, optional) LSTM H0" + "This is a tensor with shape (N x D), where N is the " + "batch size and D is the gate size.") + .AsDispensable(); + AddInput("AttentionWeight", + "(Tensor) the weights of attention fc. Always relu the fc result." + "The shape is ((M+D) x 1), where M is the dim size of x, D is the " + "gate size of LSTM."); + AddInput("AttentionBias", + "(Tensor, optional) the bias of attention fc." + "The shape is (1 x 1)") + .AsDispensable(); + AddInput("AttentionScalar", + "(Tensor, optional) the scalar on the result of attentioned fc. " + "Always relu the Scalar." + "The shape is (1 x 1)") + .AsDispensable(); + AddInput("AttentionScalarBias", + "(Tensor, optional) the scalar bias of attention fc." + "The shape is (1 x 1)") + .AsDispensable(); + AddInput("LSTMWeight", + "(Tensor) the combined weight of LSTM" + " - The shape is ((D+M) x 4D), where D is the hidden gate size, M " + "is the dim size of x" + " - Weight = {W_forget, W_input, W_output, W_cell}"); + AddInput("LSTMBias", + "(Tensor) the combined bias of LSTM, shape (1x4D)." + "Note: we should add the bias of hidden and context accorindg to " + "the same gate: " + "{B_forget, B_input, B_output, B_cell}"); + AddOutput("Hidden", + "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("Cell", + "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. 
" + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("AttentionedX", + "(Tensor) shape is (T x 1), the result after X * AttentionWeight," + " where T is the total time steps in this mini-batch," + " D is the hidden size.") + .AsIntermediate(); + AddOutput("AttentionFCOut", + "(Tensor) (max_seq_len, 1), compute at each step.") + .AsIntermediate(); + AddOutput("LSTMX", + "(Tensor) the input X of LSTM for each step." + "Shape is (1 x M), where M is the x frame size") + .AsIntermediate(); + AddOutput( + "LSTMOUT", + "(Tensor) the output of LSTM X(1*(D+M))* weight((D+M)*4D) for each step." + "Shape is (1 x 4D), where M is the x frame size") + .AsIntermediate(); + AddAttr("gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Attention Long-Short Term Memory (LSTM) Operator. + +Attention part: +concat( x(seqlen * M), expand( cell_t-1(1,D) ) ) => tmp(seqlen*(M+D)) + +tmp(seqlen*(M+D)) * fc((M+D)*1) => fcout(seqlen*1) with bias, relu + +fcout(seqlen*1) * scalar => fcout(seqlen*1) with bias, relu + +dotmul and sum pool ( fcout(seqlen*1), x(seqlen * M) ) => lstm_x_t(1, M) + +LSTM part: +use lstm_x_t as input and compute as standard LSTM. + +)DOC"); +} + +// y[i] = (x[i] + bias[0]) > 0 ? (x[i] + bias[0]) : 0; +template +inline void bias_relu(const int n, const T* x, const T* bias, T* y) { + if (bias) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] + bias[0]; + } + math::vec_relu(n, y, y); + } else { + math::vec_relu(n, x, y); + } +} + +template +inline void vec_softmax(const math::BlasT& blas, const int n, + const T* x, T* y) { + T scalar = x[0]; + // max + for (int i = 1; i < n; ++i) { + scalar = scalar < x[i] ? 
x[i] : scalar; + } + + // sub + for (int i = 0; i < n; ++i) { + y[i] = x[i] - scalar; + } + + // exp + blas.VEXP(n, y, y); + + // sum + scalar = T(0); + for (int i = 0; i < n; ++i) { + scalar += y[i]; + } + + // scale + blas.SCAL(n, static_cast(1) / scalar, y); +} + +template +class AttentionLSTMKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using DeviceContext = paddle::platform::CPUDeviceContext; + + auto* x = ctx.Input("X"); + auto* h0 = ctx.Input("H0"); + auto* c0 = ctx.Input("C0"); + auto* atten_w = ctx.Input("AttentionWeight"); + auto* atten_b = ctx.Input("AttentionBias"); + auto* atten_scalar = ctx.Input("AttentionScalar"); + auto* atten_scalar_bias = ctx.Input("AttentionScalarBias"); + auto* lstm_w = ctx.Input("LSTMWeight"); + auto* lstm_b = ctx.Input("LSTMBias"); + + auto* hidden_out = ctx.Output("Hidden"); + auto* cell_out = ctx.Output("Cell"); + auto* atted_x = ctx.Output("AttentionedX"); + auto* fc_out = ctx.Output("AttentionFCOut"); + auto* lstm_x = ctx.Output("LSTMX"); + auto* lstm_out = ctx.Output("LSTMOUT"); + + // some shape should be reshape here since infershape can not get lod info + auto x_lod = x->lod(); + const int N = x_lod[0].size() - 1; // batch size + auto x_dims = x->dims(); // T x M + auto w_dims = lstm_w->dims(); // (D+M) x 4D + const int total_T = x_dims[0]; + const int M = x_dims[1]; // x frame size + const int D = w_dims[1] / 4; // gate frame size + const int D2 = D * 2; + const int D3 = D * 3; + const int D4 = w_dims[1]; + int max_seq_len = x_lod[0][1]; + for (int i = 1; i < N; ++i) { + int len = x_lod[0][i + 1] - x_lod[0][i]; + max_seq_len = max_seq_len < len ? len : max_seq_len; + } + PADDLE_ENFORCE_EQ(x_lod.size(), 1, "Input(X)'s lod size must be 1."); + PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D); + fc_out->Resize({max_seq_len, 1}); + + math::VecActivations act_functor; + std::function act_gate, act_cell, act_cand; + act_gate = act_functor(ctx.Attr("gate_activation")); + act_cell = act_functor(ctx.Attr("cell_activation")); + act_cand = act_functor(ctx.Attr("candidate_activation")); + + const T* x_data = x->data(); + const T* h0_data = h0 ? h0->data() : NULL; + const T* c0_data = c0->data(); + const T* lstm_w_data = lstm_w->data(); + const T* lstm_b_data = lstm_b->data(); + const T* atten_w_data = atten_w->data(); + const T* atten_b_data = atten_b ? atten_b->data() : NULL; + const T* atten_scalar_data = atten_scalar ? atten_scalar->data() : NULL; + const T* atten_scalar_bias_data = + atten_scalar_bias ? atten_scalar_bias->data() : NULL; + + T* hidden_out_data = hidden_out->mutable_data(ctx.GetPlace()); + T* cell_out_data = cell_out->mutable_data(ctx.GetPlace()); + T* atted_x_data = atted_x->mutable_data(ctx.GetPlace()); + T* fc_out_data = fc_out->mutable_data(ctx.GetPlace()); + T* lstm_x_data = lstm_x->mutable_data(ctx.GetPlace()); + T* lstm_out_data = lstm_out->mutable_data(ctx.GetPlace()); + + // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1 + auto blas = math::GetBlas(ctx); + math::FCCompute(blas, total_T, 1, M, x_data, atten_w_data, + atted_x_data, atten_b_data); + + const T* cur_atten_x_data = atted_x_data; + const T* cur_x_data = x_data; + const T* prev_cell_data = NULL; + const T* prev_hidden_data = NULL; + T* cur_cell_out_data = cell_out_data; + T* cur_hidden_out_data = hidden_out_data; + for (int i = 0; i < N; ++i) { + int seq_len = x_lod[0][i + 1] - x_lod[0][i]; + prev_cell_data = c0_data + i * D; + prev_hidden_data = h0_data ? 
h0_data + i * D : NULL; + for (int step = 0; step < seq_len; ++step) { + /// 1. compute attention vector + // 1a. prev_cell(1xD) * fc(D) rest part of atten_wgt + T prev_cell_bias = blas.DOT(D, prev_cell_data, atten_w_data + M); + // 1b. add cell bias and relu + bias_relu(seq_len, cur_atten_x_data, &prev_cell_bias, fc_out_data); + // 1c. fc scalar + if (atten_scalar_data) { + blas.SCAL(seq_len, *atten_scalar_data, fc_out_data); + bias_relu(seq_len, fc_out_data, atten_scalar_bias_data, + fc_out_data); + } + // 1d. softmax + vec_softmax(blas, seq_len, fc_out_data, fc_out_data); + // mul x(seq_len*M) and sum pool + math::FCCompute(blas, 1, M, seq_len, fc_out_data, + cur_x_data, lstm_x_data); + + /// 2. compute LSTM step + // lstm weight : concat[forget , input , output , tilde] + // shape : (D + M) x (4 * D) + // fc inputX(1xM) * weightX(M*(4D)) => 1 x 4D + blas.MatMul(1, D4, M, lstm_x_data, lstm_w_data + D * D4, lstm_out_data); + if (prev_hidden_data) { + blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast(1), + prev_hidden_data, D, lstm_w_data, D4, static_cast(1), + lstm_out_data, D4); + } + // since input is 1xM, so can use add bias + blas.VADD(D4, lstm_b_data, lstm_out_data, lstm_out_data); + + // gate act: sigmoid + act_gate(D3, lstm_out_data, lstm_out_data); + // candicate act: tanh + act_cand(D, lstm_out_data + D3, lstm_out_data + D3); + + // a = forget * prev_cell + blas.VMUL(D, lstm_out_data, prev_cell_data, lstm_out_data); + + // b = input * tilde + blas.VMUL(D, lstm_out_data + D, lstm_out_data + D3, lstm_out_data + D); + + // cell_out = a + b + blas.VADD(D, lstm_out_data, lstm_out_data + D, cur_cell_out_data); + + // state act tanh(cell_out) * output_gate + act_cell(D, cur_cell_out_data, lstm_out_data); + blas.VMUL(D, lstm_out_data, lstm_out_data + D2, cur_hidden_out_data); + + prev_hidden_data = cur_hidden_out_data; + prev_cell_data = cur_cell_out_data; + cur_cell_out_data = cur_cell_out_data + D; + cur_hidden_out_data = cur_hidden_out_data + D; + } + cur_x_data = cur_x_data + seq_len * M; + cur_atten_x_data = cur_atten_x_data + seq_len; + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(attention_lstm, ops::AttentionLSTMOp, + ops::AttentionLSTMOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OP_CPU_KERNEL(attention_lstm, ops::AttentionLSTMKernel, + ops::AttentionLSTMKernel); diff --git a/paddle/fluid/operators/attention_lstm_op.h b/paddle/fluid/operators/attention_lstm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..6ede3a7f3c96dd2d13d7c5c19816647e16a3c8d0 --- /dev/null +++ b/paddle/fluid/operators/attention_lstm_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class AttentionLSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class AttentionLSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 5912a1a17cbd29c3ebd83f37133c044f0905c8bd..969f75544fa42b948e982569c3d6105d3ce282d6 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -135,7 +135,7 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Variance", "The global variance (for training) " "or estimated Variance (for testing)"); - AddOutput("Y", "result after normalization").Reuse("X"); + AddOutput("Y", "result after normalization"); AddOutput("MeanOut", "Share memory with Mean. " "Store the global mean when training") diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index a496301526f58875ff51aeaa5b2094c3c656531c..78be2e1e1f06c7a518e35a770c1dc9581b2d10fe 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -62,9 +62,21 @@ class ConcatGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const { auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto ins = ctx.MultiInput("X"); + auto ins = ctx.MultiInput("X"); auto out_var_names = ctx.Outputs(framework::GradVarName("X")); - auto outs = ctx.MultiOutput(framework::GradVarName("X")); + auto outs = + ctx.MultiOutput(framework::GradVarName("X")); + + { + auto dx = outs; + auto x = ins; + for (size_t i = 0; i < dx.size(); ++i) { + if (dx[i] != nullptr) { + dx[i]->set_lod(x[i]->lod()); + } + } + } + int64_t axis = static_cast(ctx.Attr("axis")); // get output tensor that the name is not kEmptyVarName diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc index 580fde753816c30b188b8a99cc63fcbafde64e25..135254ce6b6bf9add7bb1f0c3f645ed47081fba4 100644 --- a/paddle/fluid/operators/conditional_block_op.cc +++ b/paddle/fluid/operators/conditional_block_op.cc @@ -29,9 +29,9 @@ class ConditionalOp : public framework::OperatorBase { protected: std::vector InputTensors( - const framework::Scope &scope) const { + const framework::Scope &scope, const std::string &in_name) const { std::vector retv; - auto xs = Inputs("X"); + auto xs = Inputs(in_name); retv.resize(xs.size(), nullptr); std::transform( xs.begin(), xs.end(), retv.begin(), @@ -81,12 +81,18 @@ class ConditionalBlockOp : public ConditionalOp { private: void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { - auto xs = InputTensors(scope); - bool need_run; if (Attr("is_scalar_condition")) { + // When is_scalar_condition is True, the conditional variable is a scalar, + // whether need to execute the operators in sub-block depends on the + // conditional variable (Cond). 
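Reviewer note, not part of the patch: a standalone sketch of the two run conditions described in the comments above. FakeTensor and NeedRun are hypothetical stand-ins for the LoDTensors gathered by InputTensors() and for the branch logic that follows.

    #include <algorithm>
    #include <vector>

    // Minimal stand-in for the tensors collected from "Cond" or "Input".
    struct FakeTensor {
      long numel;         // number of elements in the tensor
      bool scalar_value;  // value of the single element when used as Cond
    };

    bool NeedRun(const std::vector<FakeTensor>& xs, bool is_scalar_condition) {
      if (is_scalar_condition) {
        // Cond carries a single boolean element; run the sub-block iff it is true.
        return !xs.empty() && xs.front().scalar_value;
      }
      // Otherwise run the sub-block iff every Input tensor is non-empty.
      return std::all_of(xs.begin(), xs.end(),
                         [](const FakeTensor& t) { return t.numel != 0; });
    }

The real operator does the same with framework::LoDTensor pointers, using ScalarCondition() for the scalar case and std::all_of over numel() otherwise, as in the code that follows.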
+ auto xs = InputTensors(scope, "Cond"); need_run = ScalarCondition(xs); } else { + // When is_scalar_condition is False, the conditional variable maybe a + // vector or tensor, whether need to execute the operators in sub-block + // depends on the input variables (Input). + auto xs = InputTensors(scope, "Input"); need_run = std::all_of( xs.begin(), xs.end(), [](const framework::LoDTensor *t) { return t->numel() != 0; }); @@ -110,11 +116,11 @@ class ConditionalBlockOp : public ConditionalOp { class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", - "The conditional variable of this operator. If X is empty, the " + AddInput("Cond", + "The conditional variable of this operator. If Cond is empty, the " "whole sub-block will not be executed.") .AsDuplicable(); - AddInput("Params", "The input variables of the sub-block.").AsDuplicable(); + AddInput("Input", "The input variables of the sub-block.").AsDuplicable(); AddOutput("Out", "The output variables of the sub-block.").AsDuplicable(); AddOutput("Scope", "(std::vector) The step scope of conditional block. To " @@ -123,13 +129,18 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddAttr( "sub_block", "The step block of conditional block operator"); AddAttr("is_scalar_condition", - "the input X is used as scalar " - "condition") + "The conditional variable (Cond) is used as scalar " + "condition.") .SetDefault(false); AddComment(R"DOC(Conditional block operator -Run the sub-block if X is not empty. Params is the other inputs and Out is the -outputs of the sub-block. +If `is_scalar_condition` is True, the conditional variable (Cond) is a scalar, +run the operators in sub-block if Cond is True. + +If `is_scalar_condition` is False, the conditional variable (Cond) is a vector or +tensor, run the operators in sub-block if all of input variables are not empty. 
+ + )DOC"); } }; @@ -145,12 +156,12 @@ class ConditionalBlockGradOp : public ConditionalOp { private: void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { - auto xs = this->InputTensors(scope); - bool need_run; if (Attr("is_scalar_condition")) { + auto xs = this->InputTensors(scope, "Cond"); need_run = ScalarCondition(xs); } else { + auto xs = this->InputTensors(scope, "Input"); need_run = std::all_of( xs.begin(), xs.end(), [](const framework::LoDTensor *t) { return t->numel() != 0; }); @@ -166,11 +177,11 @@ class ConditionalBlockGradOp : public ConditionalOp { auto *block = Attr("sub_block"); exec.Run(*block->Program(), &cur_scope, block->ID(), false); - AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"), - Outputs(framework::GradVarName("Params"))); + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Input"), + Outputs(framework::GradVarName("Input"))); - AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"), - Outputs(framework::GradVarName("X"))); + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Cond"), + Outputs(framework::GradVarName("Cond"))); } } @@ -199,15 +210,15 @@ class ConditionalBlockGradOp : public ConditionalOp { class ConditionalBlockGradInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *context) const override { - PADDLE_ENFORCE(context->HasInputs("X")); - if (context->HasInputs("Params")) { - PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Params"))); - context->SetOutputsDim(framework::GradVarName("Params"), - context->GetInputsDim("Params")); + PADDLE_ENFORCE(context->HasInputs("Cond")); + if (context->HasInputs("Input")) { + PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Input"))); + context->SetOutputsDim(framework::GradVarName("Input"), + context->GetInputsDim("Input")); } - if (context->HasOutputs(framework::GradVarName("X"))) { - context->SetOutputsDim(framework::GradVarName("X"), - context->GetInputsDim("X")); + if (context->HasOutputs(framework::GradVarName("Cond"))) { + context->SetOutputsDim(framework::GradVarName("Cond"), + context->GetInputsDim("Cond")); } } }; @@ -220,14 +231,15 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker { std::unique_ptr Apply() const override { auto grad_op = new framework::OpDesc(); grad_op->SetType("conditional_block_grad"); - grad_op->SetInput("X", Input("X")); - grad_op->SetInput("Params", Input("Params")); + grad_op->SetInput("Cond", Input("Cond")); + grad_op->SetInput("Input", Input("Input")); grad_op->SetInput("Out", Output("Out")); grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); grad_op->SetInput("Scope", Output("Scope")); - grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); - grad_op->SetOutput(framework::GradVarName("Params"), - InputGrad("Params", false)); + grad_op->SetOutput(framework::GradVarName("Cond"), + InputGrad("Cond", false)); + grad_op->SetOutput(framework::GradVarName("Input"), + InputGrad("Input", false)); grad_op->SetBlockAttr("sub_block", this->grad_block_[0]); grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition")); return std::unique_ptr(grad_op); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 59bfe8f61d8ebb530ba617006650c0ef9215e2a6..22cbf680c0670552fb014043c69fcadc56863529 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -39,27 +39,6 @@ using 
ScalingParamType = typename platform::CudnnDataType::ScalingParamType; static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = static_cast(1024) * 1024 * 1024; -template -// bool EnableFp16(const T& dummy, const DeviceContext& dev_ctx, -bool EnableFp16(const DeviceContext& dev_ctx, - cudnnConvolutionDescriptor_t cudnn_conv_desc) { -#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - // Tensor core is supported since the volta GPU and - // is only enabled when input and filter data are float16 - if (dev_ctx.GetComputeCapability() >= 70 && - std::type_index(typeid(T)) == - std::type_index(typeid(platform::float16))) { - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( - cudnn_conv_desc, CUDNN_TENSOR_OP_MATH)); - return true; - } else { - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( - cudnn_conv_desc, CUDNN_DEFAULT_MATH)); - } -#endif - return false; -} - template class CUDNNConvOpKernel : public framework::OpKernel { public: @@ -149,14 +128,27 @@ class CUDNNConvOpKernel : public framework::OpKernel { cudnnConvolutionFwdAlgo_t algo; auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); - if (EnableFp16(dev_ctx, cudnn_conv_desc)) { + + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); + +#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) + // Tensor core is supported since the volta GPU and + // is only enabled when input and filter data are float16 + if (dev_ctx.GetComputeCapability() >= 70 && + std::type_index(typeid(T)) == + std::type_index(typeid(platform::float16))) { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + cudnn_conv_desc, CUDNN_TENSOR_OP_MATH)); + // Currently tensor core is only enabled using this algo algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; } else { - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( - handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, - cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + cudnn_conv_desc, CUDNN_DEFAULT_MATH)); } +#endif // get workspace size able to allocate CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( @@ -296,9 +288,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } else { data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } - if (EnableFp16(dev_ctx, cudnn_conv_desc)) { - data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; - } CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( @@ -318,9 +307,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } else { filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } - if (EnableFp16(dev_ctx, cudnn_conv_desc)) { - filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; - } CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( @@ -376,8 +362,7 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvOpKernel); REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); + paddle::operators::CUDNNConvGradOpKernel); REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvOpKernel, @@ -385,5 +370,4 @@ 
REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvOpKernel); REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel) + paddle::operators::CUDNNConvGradOpKernel); diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index f07ab5a33b87d7945e5fcdf8f3644f0711ce643b..527a87db533ac25c3170fbb3ae6a9b9aff589b3d 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -126,6 +126,15 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { pipeline); } + std::shared_ptr AcquireBiasMemoryFromPrimitive( + const std::shared_ptr user_bias_memory_p, + std::vector& pipeline) { // NOLINT + auto user_bias_pd = user_bias_memory_p->get_primitive_desc(); + auto bias_pd = conv_pd_->bias_primitive_desc(); + return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, + "@bias_mem_p", pipeline); + } + std::shared_ptr AcquireConvolution( std::shared_ptr src_memory_p, std::shared_ptr weights_memory_p, @@ -147,6 +156,28 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { return conv_p; } + std::shared_ptr AcquireConvolution( + std::shared_ptr src_memory_p, + std::shared_ptr weights_memory_p, + std::shared_ptr bias_memory_p, + std::shared_ptr dst_memory_p) { + auto prim_key = key_ + "@conv_p"; + auto conv_p = std::static_pointer_cast( + dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution primitive in device context"); + if (conv_p == nullptr) { + conv_p = std::make_shared( + *conv_pd_, *(src_memory_p), *(weights_memory_p.get()), + *(bias_memory_p.get()), *(dst_memory_p.get())); + + dev_ctx_.SetBlob(prim_key, conv_p); + } else { + is_reusing_ = true; + } + return conv_p; + } + std::shared_ptr AcquireConvolutionBackwardWeights( std::shared_ptr src_memory_p, @@ -229,6 +260,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); + auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; auto* output = ctx.Output("Output"); PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && @@ -237,6 +269,17 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && filter->format() != memory::format::format_undef, "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(input->dims().size() == 4, + "Input must be with 4 dimensions, i.e. NCHW"); + PADDLE_ENFORCE(filter->dims().size() == 4, + "Filter must be with 4 dimensions, i.e. OIHW"); + if (bias) { + PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && + bias->format() != memory::format::format_undef, + "Wrong layout/format set for Bias tensor"); + PADDLE_ENFORCE(bias->dims().size() == 1, + "Bias must only have 1 dimension, i.e. X"); + } std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); @@ -253,11 +296,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const T* filter_data = filter->data(); T* output_data = output->mutable_data(ctx.GetPlace()); - PADDLE_ENFORCE(input->dims().size() == 4, - "Input must be with 4 dimensions, i.e. NCHW"); - PADDLE_ENFORCE(filter->dims().size() == 4, - "Filter must be with 4 dimensions, i.e. 
OIHW"); - std::vector src_tz = paddle::framework::vectorize2int(input->dims()); std::vector weights_tz = paddle::framework::vectorize2int(filter->dims()); @@ -288,13 +326,23 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + std::vector bias_tz; // TODO(mgallus): avoid empty vector creation. + // Currently used whenever bias is != nullptr. auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); // create a conv primitive descriptor and save it for usage in backward - std::shared_ptr conv_pd = - ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, - mkldnn_engine); + std::shared_ptr conv_pd; + if (bias) { + bias_tz = paddle::framework::vectorize2int(bias->dims()); + auto bias_md = platform::MKLDNNMemDesc( + bias_tz, platform::MKLDNNGetDataType(), memory::format::x); + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, + strides, paddings, mkldnn_engine); + } else { + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, + paddings, mkldnn_engine); + } // Save conv_pd/src_memory/weights_memory for backward pass dev_ctx.SetBlob(key_conv_pd, conv_pd); @@ -315,8 +363,22 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); // create convolution op primitive - auto conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, - dst_memory_p); + std::shared_ptr conv_p; + if (bias) { + const T* bias_data = bias->data(); + auto user_bias_md = platform::MKLDNNMemDesc( + {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); + auto user_bias_memory_p = + handler.AcquireBiasMemory(user_bias_md, to_void_cast(bias_data)); + + auto bias_memory_p = + handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline); + conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, + bias_memory_p, dst_memory_p); + } else { + conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, + dst_memory_p); + } // push primitive to stream and wait until it's executed pipeline.push_back(*conv_p); @@ -346,6 +408,27 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { return std::unique_ptr( p_conv_pd); } + + std::unique_ptr + ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, + const memory::desc& bias, const memory::desc& dst, + const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto conv_desc = mkldnn::convolution_forward::desc( + mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, + bias, dst, stride_dims, padding_dims, padding_dims, + mkldnn::padding_kind::zero); + + auto p_conv_pd = + new mkldnn::convolution_forward::primitive_desc(conv_desc, engine); + + return std::unique_ptr( + p_conv_pd); + } }; template diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 37153d58439a90190eb2ad82d5dcc145e22dfa48..61ca80877a6dfcdf30a0ff346342116e36eec6f2 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -37,6 +37,7 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { auto in_dims = ctx->GetInputDim("Input"); auto filter_dims = 
ctx->GetInputDim("Filter"); + std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); int groups = ctx->Attrs().Get("groups"); @@ -57,7 +58,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups, "The number of input channels should be equal to filter " "channels * groups."); - PADDLE_ENFORCE_EQ( filter_dims[0] % groups, 0, "The number of output channels should be divided by groups."); @@ -122,6 +122,11 @@ void Conv2DOpMaker::Make() { "H is the height of the filter, and W is the width of the filter. " "If the groups attribute is greater than 1, C equals the number of " "input image channels divided by the groups."); + AddInput("Bias", + "(Tensor) Bias to be added to each output of filter application." + "The format of output tensor is X (one-dimensional) of size equal" + "to the number of output channels. Only used with MKL-DNN.") + .AsDispensable(); AddOutput("Output", "(Tensor) The output tensor of convolution operator. " "The format of output tensor is also NCHW.") diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 3f5fab3b382bea97f43e4bc1b2cd436c956ba264..8181897c3d3844bda5574e85a08b2af038fcd664 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -85,6 +85,199 @@ class CRFDecodingOpKernel : public framework::OpKernel { int* track_value = track.mutable_data(emission_dims, platform::CPUPlace()); +#ifdef __AVX__ +// It use the AVX or AVX512 instruction to deal the data as the vector of 8 or +// 16 elements per iteration. Then it can implement the parallel processing. +// Only optimize for float type. +#ifdef __AVX512F__ + size_t step_size = 16; +#else + size_t step_size = 8; +#endif + if (std::is_same::value && (tag_num >= step_size)) { + size_t steps = tag_num / step_size; + size_t remain = tag_num % step_size; + int last_offset = static_cast(remain) - static_cast(step_size); + + // Setup the alpha initial value. + size_t i_offset = 0; + for (size_t i = 0; i <= steps; ++i) { +#ifdef __AVX512F__ + // Declare the variable for the content of weights, input and alpha + // values. + __m512 w_content, x_content, alpha_content; + + // Load the relevant data into the variables from un-aligned address. + w_content = _mm512_loadu_ps((const float*)(w + i_offset)); + x_content = _mm512_loadu_ps((const float*)(x + i_offset)); + alpha_content = _mm512_add_ps(w_content, x_content); + + // Save the alpha value. + _mm512_storeu_ps(reinterpret_cast(alpha_value + i_offset), + alpha_content); +#else + // Declare the variable for the content of weights, input and alpha + // values. + __m256 w_content, x_content, alpha_content; + + // Load the relevant data into the variables from un-aligned address. + w_content = _mm256_loadu_ps((const float*)(w + i_offset)); + x_content = _mm256_loadu_ps((const float*)(x + i_offset)); + alpha_content = _mm256_add_ps(w_content, x_content); + + // Save the alpha value. + _mm256_storeu_ps(reinterpret_cast(alpha_value + i_offset), + alpha_content); +#endif + i_offset += step_size; + if (i == steps - 1) { + if (remain > 0) { + i_offset += last_offset; + } else { + break; + } + } + } + + // Use the column-major strategy to get the location of maximum score. 
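Reviewer note, not part of the patch: a compact statement of the recurrence that both the vectorized loops below and the scalar fallback at the end of this block implement. For each step k > 0 and each current tag cur,

    alpha[k][cur] = max over prev of ( alpha[k-1][prev] + w[prev + state_trans_base_idx][cur] ) + x[k][cur]
    track[k][cur] = the prev that attains that maximum

The AVX/AVX512 path evaluates this for step_size current tags at a time: the inner loop runs over the previous tags, _mm*_max_ps keeps the running per-lane maximum, and the compare mask blends the index of the previous tag into max_j only in the lanes where the score improved.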
+ size_t seq_offset = 0; + for (size_t k = 1; k < seq_len; ++k) { + size_t j_offset = 0; + for (size_t j = 0; j <= steps; ++j) { +#ifdef __AVX512F__ + // Initialize the variables of maximum score and location. + __m512 max_score = _mm512_set1_ps(-std::numeric_limits::max()); + __m512i max_j = _mm512_setzero_si512(); +#else + // Initialize the variables of maximum score and location. + __m256 max_score = _mm256_set1_ps(-std::numeric_limits::max()); + __m256i max_j = _mm256_set1_epi32(0); +#endif + // Calculate the offset of transition_weights. + size_t trans_offset = state_trans_base_idx * tag_num + j_offset; + for (size_t i = 0; i < tag_num; ++i) { +#ifdef __AVX512F__ + // Initalize the content of alpha variable with related offset. + __m512 alpha_content = + _mm512_set1_ps(*(const float*)(alpha_value + seq_offset + i)); + // Obtain the content of weights from un-aligned address. + __m512 w_content = + _mm512_loadu_ps((const float*)(w + trans_offset)); + + __m512 score_v = _mm512_add_ps(alpha_content, w_content); + + __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS); + + // According to the mask value, it update the index of the max_score + // location. + max_j = _mm512_mask_set1_epi32(max_j, mask, i); + + // Update the max_score value. + max_score = _mm512_max_ps(max_score, score_v); +#else + // Initalize the content of alpha variable with related offset. + __m256 alpha_content = _mm256_broadcast_ss( + (const float*)(alpha_value + seq_offset + i)); + // Obtain the content of weights from un-aligned address. + __m256 w_content = + _mm256_loadu_ps((const float*)(w + trans_offset)); + __m256 score_v = _mm256_add_ps(alpha_content, w_content); + + __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS); + +#ifdef __AVX2__ + // According to the mask value, it update the index of the max_score + // location. + max_j = _mm256_or_si256( + _mm256_andnot_si256((__m256i)mask, max_j), + _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i))); +#else + __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0); + __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1); + __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0); + __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1); + + lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j); + hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j); + lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i)); + hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i)); + + lo_max_j = _mm_or_si128(lo_mask, lo_max_j); + hi_max_j = _mm_or_si128(hi_mask, hi_max_j); + + // According to the mask value, it update the index of the max_score + // location. + max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0); + max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1); +#endif + + // Update the max_score value. + max_score = _mm256_max_ps(max_score, score_v); +#endif + trans_offset += tag_num; + } + +#ifdef __AVX512F__ + // Update the alpha and track values. + __m512 x_content = _mm512_loadu_ps( + (const float*)(x + seq_offset + tag_num + j_offset)); + max_score = _mm512_add_ps(max_score, x_content); + _mm512_storeu_ps(reinterpret_cast(alpha_value + seq_offset + + tag_num + j_offset), + max_score); + _mm512_storeu_si512( + reinterpret_cast<__m512i*>(track_value + seq_offset + tag_num + + j_offset), + max_j); +#else + // Update the alpha and track values. 
+ __m256 x_content = _mm256_loadu_ps( + (const float*)(x + seq_offset + tag_num + j_offset)); + max_score = _mm256_add_ps(max_score, x_content); + _mm256_storeu_ps(reinterpret_cast(alpha_value + seq_offset + + tag_num + j_offset), + max_score); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(track_value + seq_offset + tag_num + + j_offset), + max_j); +#endif + + // Calculate the offset of next step + j_offset += step_size; + if (j == steps - 1) { + if (remain > 0) { + j_offset += last_offset; + } else { + break; + } + } + } + + seq_offset += tag_num; + } + } else { + for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i]; + + for (size_t k = 1; k < seq_len; ++k) { + for (size_t i = 0; i < tag_num; ++i) { + T max_score = -std::numeric_limits::max(); + int max_j = 0; + for (size_t j = 0; j < tag_num; ++j) { + T score = alpha_value[(k - 1) * tag_num + j] + + w[(j + state_trans_base_idx) * tag_num + i]; + if (score > max_score) { + max_score = score; + max_j = j; + } + } + + alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i]; + track_value[k * tag_num + i] = max_j; + } + } + } +#else for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i]; for (size_t k = 1; k < seq_len; ++k) { @@ -105,6 +298,7 @@ class CRFDecodingOpKernel : public framework::OpKernel { } } +#endif T max_score = -std::numeric_limits::max(); int max_i = 0; for (size_t i = 0; i < tag_num; ++i) { diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu index 65fd3a5dbc9ffed4c5d1114346fcc0660c183dae..30dbd5bd3d39dd2992c3dd91364003bb7715a2eb 100644 --- a/paddle/fluid/operators/cross_entropy_op.cu +++ b/paddle/fluid/operators/cross_entropy_op.cu @@ -13,16 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cross_entropy_op.h" -#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; -namespace plat = paddle::platform; using CUDACtx = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpKernel, - ops::CrossEntropyOpKernel, - ops::CrossEntropyOpKernel); -REGISTER_OP_CUDA_KERNEL( - cross_entropy_grad, ops::CrossEntropyGradientOpKernel, - ops::CrossEntropyGradientOpKernel, - ops::CrossEntropyGradientOpKernel); + ops::CrossEntropyOpKernel); +REGISTER_OP_CUDA_KERNEL(cross_entropy_grad, + ops::CrossEntropyGradientOpKernel, + ops::CrossEntropyGradientOpKernel); diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index de1a503154deb967eb4389a9f43b86c05626d966..66784f0b5149a7c479a90a407709d993f4a40a8b 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -130,12 +130,13 @@ bool RequestCheckpointHandler::Handle(const std::string& varname, checkpoint_notify_id != -1, "when checkpoint_notify_id = -1, there should be no RPC invoke."); - auto* lt_var = scope->FindVar(LOOKUP_TABLE_PATH)->GetMutable(); + // TODO(tangwei12): find out why scope will be error. 
+ auto* lt_var = scope_->FindVar(LOOKUP_TABLE_PATH)->GetMutable(); lt_var->clear(); lt_var->append(out_var_name); VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: " << out_var_name; - executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope); + executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope_); return true; } diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc index b50830c362d3f6ecf38affbfa6a1ffe2ed77e125..d6176e1443d2a441af7878e5efe99796d486bb7a 100644 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -78,10 +78,9 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, int64_t rows_numel) { CreateVarsOnScope(scope, place); auto w = scope->Var("w")->GetMutable(); - auto rows = w->mutable_rows(); - for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i); auto w_value = w->mutable_value(); w_value->Resize({rows_numel, 10}); + for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); auto ptr = w_value->mutable_data(*place); diff --git a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc index c86cd57316078778e5930c9b524b931d523028d7..9ad82aec8182d6ba06b67391d71317a3d0df1833 100644 --- a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc @@ -137,9 +137,10 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { }; template -class EltwiseAddMKLDNNGradKernel : public framework::OpKernel { +class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; auto* dout = ctx.Input(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/elementwise_add_op.cu b/paddle/fluid/operators/elementwise_add_op.cu index f9f5c66d34fa1d73db00173e493f9953b8579518..dfff518f170b56d180b6883c363effb8dbd677b6 100644 --- a/paddle/fluid/operators/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise_add_op.cu @@ -30,5 +30,4 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel); diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h index 5356105e2e551c0528694091608fc7585dce66d2..c60cb1f92e99329d52f6ed39dccde406a5f83563 100644 --- a/paddle/fluid/operators/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise_add_op.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" @@ -136,9 +137,11 @@ elementwise_add_grad(const framework::ExecutionContext& ctx, } template -class ElementwiseAddGradKernel : public framework::OpKernel { +class ElementwiseAddGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + using Tensor = framework::Tensor; auto* dout = ctx.Input(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/elementwise_div_op.cu b/paddle/fluid/operators/elementwise_div_op.cu index 4cc7ba0f43c6031bf4a27222a17eca84bad5a668..588d1f7420241ba1697e5141e4e4a2870f2dc87c 100644 --- a/paddle/fluid/operators/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise_div_op.cu @@ -14,24 +14,19 @@ limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise_div_op.h" -#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; -namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( elementwise_div, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel); + ops::ElementwiseDivKernel); REGISTER_OP_CUDA_KERNEL( elementwise_div_grad, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel); + int64_t>); diff --git a/paddle/fluid/operators/elementwise_div_op.h b/paddle/fluid/operators/elementwise_div_op.h index 95649ac46e6bd41b9e1a865794cdec3ae1e6e247..41a7950bf0c598507c0fda48c6a43f2fd38c41d2 100644 --- a/paddle/fluid/operators/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise_div_op.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise_op_function.h" - namespace paddle { namespace operators { @@ -53,9 +53,10 @@ struct DivGradDY { }; template -class ElementwiseDivGradKernel : public framework::OpKernel { +class ElementwiseDivGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; auto* x = ctx.Input("X"); diff --git a/paddle/fluid/operators/elementwise_max_op.h b/paddle/fluid/operators/elementwise_max_op.h index 527a18ee3ba88a158a13266a7fbcdafe59ec69d9..bfb5c931958b4ca890ea720af42dad91d5625abb 100644 --- a/paddle/fluid/operators/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise_max_op.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise_op_function.h" namespace paddle { @@ -55,9 +56,10 @@ struct MaxGradDy { }; template -class ElementwiseMaxGradKernel : public framework::OpKernel { +class ElementwiseMaxGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; auto* x = ctx.Input("X"); diff --git a/paddle/fluid/operators/elementwise_min_op.h b/paddle/fluid/operators/elementwise_min_op.h index d4e5831463f3e54c72789b6876ea696cf1b4ef4b..db035ffb52e619b337c8190af4ed0e155aaac48d 100644 --- a/paddle/fluid/operators/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise_min_op.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise_op_function.h" - namespace paddle { namespace operators { @@ -55,9 +55,10 @@ struct MinGradDy { }; template -class ElementwiseMinGradKernel : public framework::OpKernel { +class ElementwiseMinGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; auto* x = ctx.Input("X"); diff --git a/paddle/fluid/operators/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise_mul_op.cu index 350d43168dea7e88127b0d28d663e680458e1dba..2fb1b4bee689c059625e3dbd59f80c541ace83a0 100644 --- a/paddle/fluid/operators/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise_mul_op.cu @@ -14,25 +14,19 @@ limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise_mul_op.h" -#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; -namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( elementwise_mul, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + ops::ElementwiseMulKernel); REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel); diff --git a/paddle/fluid/operators/elementwise_mul_op.h b/paddle/fluid/operators/elementwise_mul_op.h index dc73cb6f23614504640283af01981d3f69e89126..4437da4d95f97b5cbbca1650badf9710c26b4380 100644 --- a/paddle/fluid/operators/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise_mul_op.h @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise_op_function.h" +#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { @@ -23,6 +25,37 @@ struct MulFunctor { inline HOSTDEVICE T operator()(T a, T b) const { return a * b; } }; +template +void default_elementwise_mul(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, framework::Tensor* z) { + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MulFunctor(), z); +} + +template +typename std::enable_if< + std::is_floating_point::value && + std::is_same::value>::type +elementwise_mul(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* z) { + auto blas = math::GetBlas(ctx); + blas.VMUL(x->numel(), x->data(), y->data(), + z->mutable_data(ctx.GetPlace())); +} + +template +typename std::enable_if< + !std::is_floating_point::value || + !std::is_same::value>::type +elementwise_mul(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* z) { + default_elementwise_mul(ctx, x, y, z); +} + template class ElementwiseMulKernel : public framework::OpKernel { public: @@ -33,9 +66,11 @@ class ElementwiseMulKernel : public framework::OpKernel { auto* y = ctx.Input("Y"); auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - MulFunctor(), z); + if (x->numel() == y->numel()) { + elementwise_mul(ctx, x, y, z); + } else { + default_elementwise_mul(ctx, x, y, z); + } } }; @@ -50,9 +85,10 @@ struct MulGradDY { }; template -class ElementwiseMulGradKernel : public framework::OpKernel { +class ElementwiseMulGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; auto* x = ctx.Input("X"); diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index d8a12e800ad733800c1ec333f15d31d4dcd1a3a5..a79b900b9801e6b80e4433a9acdd4dab6c34859d 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -205,6 +205,20 @@ class ElementwiseOpExplicitGrad : public ElementwiseOpGrad { } }; +template +class ElemwiseGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* dx = + context.Output(framework::GradVarName("X")); + if (dx != nullptr) { + auto& dout = + *context.Input(framework::GradVarName("Out")); + dx->set_lod(dout.lod()); + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 7223a972d23119c8ef93fb49bfe42922cc14571d..f90dcdc156590b776f817a4933d5a9b45868ba98 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -80,6 +80,9 @@ inline framework::DDim trim_trailing_singular_dims( for (int i = 0; i < actual_dims_size; ++i) { trim_dims[i] = dims[i]; } + if (trim_dims.size() == 0) { + return framework::DDim(framework::make_dim()); + } framework::DDim actual_dims = framework::make_ddim(trim_dims); return actual_dims; } @@ -350,7 +353,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel( int j = 
blockIdx.x; int i = threadIdx.x; int tid = threadIdx.x; - T val(0); + T val = 0; do { int x_offset = i * w + j; @@ -418,7 +421,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel( int tid = threadIdx.x; int j = blockIdx.x; - T val(0); + T val = 0; int ttid = tid; while (true) { diff --git a/paddle/fluid/operators/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise_sub_op.cu index ff3f6f8a2cb542c2fb6b43d539f6413b39250992..8709f686f9af1bf4dacbc2dfc3e2d5dcc1c59b9a 100644 --- a/paddle/fluid/operators/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise_sub_op.cu @@ -14,25 +14,19 @@ limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/fluid/operators/elementwise_sub_op.h" -#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; -namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( elementwise_sub, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel); + ops::ElementwiseSubKernel); REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel); diff --git a/paddle/fluid/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h index 11c7e3fe628001f095836a788f2bcc7c4ee7ad4b..3385df0897700d37d60d8804a01db777ebc02a7e 100644 --- a/paddle/fluid/operators/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise_sub_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/fluid/operators/elementwise_op.h" #include "paddle/fluid/operators/elementwise_op_function.h" namespace paddle { @@ -50,9 +51,10 @@ struct SubGradDY { }; template -class ElementwiseSubGradKernel : public framework::OpKernel { +class ElementwiseSubGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; auto* dout = ctx.Input(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 099ca52c8e945a0e93c2f13adb612158c67397cf..fa4dec9cf118cef9b836943fd4eae90d23e6218a 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -15,8 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fc_op.h" #include #include "paddle/fluid/operators/math/blas.h" - -DECLARE_int32(paddle_num_threads); +#include "paddle/fluid/operators/math/fc_compute.h" namespace paddle { namespace operators { @@ -36,9 +35,14 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { if (ctx->HasInput("Bias")) { auto bias_dims = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ(bias_dims[0], 1, "The shape of Bias must be [1, dim]."); - PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1], - "The shape of Bias must be [1, dim]."); + if (bias_dims.size() == 2) { + PADDLE_ENFORCE_EQ(bias_dims[0], 1, "The shape of Bias must be [1, dim]."); + PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1], + "The shape of Bias must be [1, dim]."); + } else if (bias_dims.size() == 1) { + PADDLE_ENFORCE_EQ(bias_dims[0], w_dims[1], + "The shape of Bias must be [1, dim]."); + } } PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4, "Fully Connected input should be 2-D or 4-D tensor."); @@ -110,13 +114,8 @@ void FCOpMaker::Make() { AddComment(R"DOC( Fully Connected Operator. - The fully connected operation calculates the output based on the input, weights and bias attribute. + The fully connected operation calculates the output based on the input, weights and bias. The size of each dimension of the parameters checked in the infer-shape. - The matrix of bias is generated by the mkldnn framework, when the bias_attr is True. - Additional parametrs are use_mkldnn and bias_attr. - The input(X) size and output(Out) size may be diffrent. - - The fully connected layer only supports MKLDNN version )DOC"); } @@ -133,26 +132,15 @@ class FCOpKernel : public framework::OpKernel { auto in_dims = input->dims(); auto w_dims = w->dims(); - auto& dev_ctx = ctx.template device_context(); - auto blas = math::GetBlas(dev_ctx); const T* input_data = input->data(); const T* w_data = w->data(); T* output_data = output->mutable_data(ctx.GetPlace()); + auto blas = math::GetBlas(ctx); + math::FCCompute( + blas, in_dims[0], w_dims[1], w_dims[0], input_data, w_data, output_data, + bias ? bias->data() : NULL); - blas.GEMM(CblasNoTrans, CblasNoTrans, in_dims[0], w_dims[1], w_dims[0], - static_cast(1), input_data, w_data, static_cast(0), - output_data); - - if (bias) { - const T* bias_data = bias->data(); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for if (FLAGS_paddle_num_threads > 1) -#endif - for (int bs = 0; bs < in_dims[0]; bs++) { - blas.AXPY(w_dims[1], static_cast(1), bias_data, - output_data + bs * w_dims[1]); - } - } + // TODO(TJ): fuse act } }; diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 862249269eaecdac262a691c884ea59f89f54061..130f18dde4f979a6a9925ede9cbf745fcec14d48 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -12,28 +12,48 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/fill_constant_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { -class FillConstantOp : public framework::OperatorWithKernel { +class FillConstantInferShape : public framework::InferShapeBase { public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { + void operator()(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of FillConstantOp should not be null."); - auto& shape = ctx->Attrs().Get>("shape"); + auto &shape = ctx->Attrs().Get>("shape"); ctx->SetOutputDim("Out", framework::make_ddim(shape)); } +}; + +class FillConstantOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto data_type = + static_cast(Attr("dtype")); + auto value = Attr("value"); + auto force_cpu = Attr("force_cpu"); + auto &out = + *scope.FindVar(Output("Out"))->GetMutable(); + out.Resize(framework::make_ddim(Attr>("shape"))); + if (force_cpu) { + auto cpu = platform::CPUPlace(); + out.mutable_data(cpu, framework::ToTypeIndex(data_type)); + } else { + out.mutable_data(dev_place, framework::ToTypeIndex(data_type)); + } - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - static_cast(ctx.Attr("dtype")), - ctx.device_context()); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + math::set_constant(dev_ctx, &out, value); } }; @@ -67,11 +87,6 @@ Fill up a variable with specified constant value. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker, +REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, + ops::FillConstantInferShape, ops::FillConstantOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - fill_constant, - ops::FillConstantOpKernel, - ops::FillConstantOpKernel, - ops::FillConstantOpKernel, - ops::FillConstantOpKernel) diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h deleted file mode 100644 index b2a2a7b2faedf9b94e01ed908ff39749973be1df..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/fill_constant_op.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" - -namespace paddle { -namespace operators { - -template -class FillConstantOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto data_type = - static_cast(ctx.Attr("dtype")); - auto value = ctx.Attr("value"); - auto force_cpu = ctx.Attr("force_cpu"); - auto* out = ctx.Output("Out"); - out->Resize(framework::make_ddim(ctx.Attr>("shape"))); - if (force_cpu) { - auto cpu = platform::CPUPlace(); - out->mutable_data(cpu, framework::ToTypeIndex(data_type)); - } else { - out->mutable_data(ctx.GetPlace(), framework::ToTypeIndex(data_type)); - } - - math::set_constant(ctx.template device_context(), out, - value); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc index 352a17c927bc70bdd6e4307951f0e0ac3d10ac2d..925dc19061e2196a40411f415eb6e5ad59ab52ff 100644 --- a/paddle/fluid/operators/fill_op.cc +++ b/paddle/fluid/operators/fill_op.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -70,6 +69,7 @@ class FillOp : public framework::OperatorBase { framework::VisitDataType( dtype, FillOpVisitor(&tensor, Attr>("value"))); + if (!force_cpu && platform::is_gpu_place(place)) { // Copy tensor to out platform::DeviceContextPool &pool = diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3888333ec5626f1d8d35db215085f483c985cf0a --- /dev/null +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -0,0 +1,354 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fusion_lstm_op.h" +#include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +namespace paddle { +namespace operators { + +void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("WeightX"), + "Input(WeightX) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("WeightH"), + "Input(WeightH) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("XX"), + "Output(XX) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(Hidden) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Output(Cell) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"), + "Output(BatchedGate) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"), + "Output(BatchedGate) of LSTM should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(Cell) and Input(Hidden) of LSTM should not " + "be null at the same time."); + auto h_dims = ctx->GetInputDim("H0"); + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + } + + auto wx_dims = ctx->GetInputDim("WeightX"); + PADDLE_ENFORCE_EQ(wx_dims.size(), 2, + "The rank of Input(WeightX) should be 2."); + PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], + "The first dimension of Input(WeightX) " + "should be %d.", + x_dims[1]); + + int frame_size = wx_dims[1] / 4; + auto wh_dims = ctx->GetInputDim("WeightH"); + PADDLE_ENFORCE_EQ(wh_dims.size(), 2, + "The rank of Input(WeightH) should be 2."); + PADDLE_ENFORCE_EQ(wh_dims[0], frame_size, + "The first dimension of Input(WeightH) " + "should be %d.", + frame_size); + PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size, + "The second dimension of Input(WeightH) " + "should be 4 * %d.", + frame_size); + + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + + PADDLE_ENFORCE(!ctx->Attrs().Get("use_peepholes"), + "Do not support peephole yet."); + PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + "The second dimension of Input(Bias) should be " + "4 * %d if disable peepholes connection", + frame_size); + + framework::DDim out_dims({x_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]}); + ctx->SetOutputDim("BatchCellPreAct", out_dims); + ctx->ShareLoD("X", "Hidden"); + ctx->ShareLoD("X", "Cell"); + + int xx_width = x_dims[1] > wx_dims[1] ? 
wx_dims[1] : x_dims[1]; + ctx->SetOutputDim("XX", {x_dims[0], xx_width}); + ctx->ShareLoD("X", "XX"); +} + +framework::OpKernelType FusionLSTMOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); +} + +void FusionLSTMOpMaker::Make() { + AddInput("X", + "(LoDTensor) the input is a LodTensor, which support " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X M), where T is the " + "total time steps in this mini-batch, M is the dim size of x."); + AddInput("WeightX", + "(Tensor) the learnable weights of X." + " - The shape is (M x 4D), where M is the dim size of x, D is the " + "hidden size. " + " - Weight = {W_cx, W_ix, W_fx, W_ox}"); + AddInput("WeightH", + "(Tensor) same as LSTMOp, the learnable hidden-hidden weights." + " - The shape is (D x 4D), where D is the hidden size. " + " - Weight = {W_ch, W_ih, W_fh, W_oh}"); + AddInput("Bias", + "(Tensor) the learnable weights. Almost same as LSTMOp" + "Note: we should add the fc bias into this (1x4D) in bias." + "input-hidden bias weight and peephole connections weight if " + "setting `use_peepholes` True. " + "1. `use_peepholes = False` " + " - The shape is (1 x 4D). " + " - Bias = {b_c, b_i, b_f, b_o}." + "2. `use_peepholes = True` " + " - The shape is (1 x 7D). " + " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddInput("H0", + "(Tensor, optional) (same as LSTMOp) the initial hidden state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size and D is the hidden size.") + .AsDispensable(); + AddInput("C0", + "(Tensor, optional) (same as LSTMOp) (the initial cell state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `H0` and `C0` can be NULL but only at the same time.") + .AsDispensable(); + AddOutput("Hidden", + "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("Cell", + "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. 
" + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("XX", + "(LoDTensor) the result after X * WeightX (size is T x 4D)" + " or batched_X (size is T x M), this will be automatically chosen," + " where T is the total time steps in this mini-batch," + " D is the hidden size, M is the dim size of x input.") + .AsIntermediate(); + AddOutput("BatchedGate", "(LoDTensor) (same as LSTMOp).").AsIntermediate(); + AddOutput("BatchCellPreAct", "(LoDTensor) (same as LSTMOp).") + .AsIntermediate(); + AddAttr("use_peepholes", + "(bool, defalut: True) " + "whether to enable diagonal/peephole connections.") + .SetDefault(true); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed LSTM.") + .SetDefault(false); + AddAttr("gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Fusion Long-Short Term Memory (LSTM) Operator. +This operator fuse the X into LSTM, more details can refer to LSTM op. +)DOC"); +} + +template +inline void ReorderInitState(const DeviceContext& ctx, + const framework::Tensor& src, + framework::Vector index_lod, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + // TODO(TJ): check mem copy perf + row_shuffle(ctx, src, index_lod, dst, indexed_src); +} + +template +class FuisonLSTMKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* wx = ctx.Input("WeightX"); + auto* wh = ctx.Input("WeightH"); + auto* bias = ctx.Input("Bias"); + auto* hidden_t0 = ctx.Input("H0"); + auto* cell_t0 = ctx.Input("C0"); + + auto* xx = ctx.Output("XX"); + auto* batched_gate = ctx.Output("BatchedGate"); + auto* hidden_out = ctx.Output("Hidden"); + auto* cell_out = ctx.Output("Cell"); + bool is_reverse = ctx.Attr("is_reverse"); + + T* xx_data = xx->mutable_data(ctx.GetPlace()); + T* batched_gate_data = batched_gate->mutable_data(ctx.GetPlace()); + hidden_out->mutable_data(ctx.GetPlace()); + cell_out->mutable_data(ctx.GetPlace()); + + const T* x_data = x->data(); + const T* wx_data = wx->data(); + auto x_dims = x->dims(); + auto wx_dims = wx->dims(); + + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = ctx.template device_context(); + auto blas = math::GetBlas(dev_ctx); + if (x_dims[1] > wx_dims[1]) { + math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], + x_data, wx_data, xx_data, + bias->data()); + to_batch(dev_ctx, *xx, batched_gate, true, is_reverse); + } else { + to_batch(dev_ctx, *x, xx, true, is_reverse); + batched_gate->set_lod(xx->lod()); + math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], + xx_data, wx_data, batched_gate_data, + bias->data()); + } + + int frame_size = static_cast(wx_dims[1] / 4); + framework::DDim out_dims({x_dims[0], frame_size}); + math::LstmMetaValue lstm_value; + // no peephole + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + 
lstm_value.check_og = nullptr; + lstm_value.prev_state_value = nullptr; + Tensor ordered_c0; + + framework::Vector order(batched_gate->lod()[2]); + + if (cell_t0) { + // Since the batch computing for LSTM reorders the input sequence + // according to their length. The initialized cell state also needs + // to reorder. + ReorderInitState(dev_ctx, *cell_t0, order, &ordered_c0, + true); + lstm_value.prev_state_value = ordered_c0.data(); + } + + // Use the local variable as here. + LoDTensor batch_hidden, batch_cell; + auto* batch_cell_pre_act = ctx.Output("BatchCellPreAct"); + batch_hidden.mutable_data(out_dims, ctx.GetPlace()); + batch_cell.mutable_data(out_dims, ctx.GetPlace()); + batch_cell_pre_act->mutable_data(out_dims, ctx.GetPlace()); + + auto batch_starts = batched_gate->lod()[0]; + size_t max_seq_len = batch_starts.size() - 1; + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + + for (size_t n = 0; n < max_seq_len; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor gate_t = batched_gate->Slice(bstart, bend); + Tensor out_t = batch_hidden.Slice(bstart, bend); + Tensor cell_t = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); + + int cur_batch_size = bend - bstart; + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); + // TODO(TJ): use gemm directly + blas.MatMul(pre_hidden_t, false, *wh, false, static_cast(1.0), + &gate_t, static_cast(1.0)); + } else if (hidden_t0) { + // TODO(TJ): move h0 outside for + // If n == 0 and there is no initialized hidden state, that is to say + // the H0 is zeros, the calculation W_h * H0 will be skiped. + // If n == 0 and there is initialized hidden state, calculate W_h * H0. + + // Since the batch computing for LSTM reorders the input sequence + // according to their length. The initialized hidden state also needs + // to reorder. 
+ Tensor ordered_h0; + ReorderInitState(dev_ctx, *hidden_t0, order, + &ordered_h0, true); + // TODO(TJ): use gemm directly + blas.MatMul(ordered_h0, false, *wh, false, static_cast(1.0), &gate_t, + static_cast(1.0)); + } + + lstm_value.gate_value = gate_t.data(); + lstm_value.output_value = out_t.data(); + lstm_value.state_value = cell_t.data(); + lstm_value.state_active_value = cell_pre_act_t.data(); + math::LstmUnitFunctor::compute( + dev_ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act, + cand_act); + lstm_value.prev_state_value = lstm_value.state_value; + } + + math::Batch2LoDTensorFunctor to_seq; + batch_hidden.set_lod(batched_gate->lod()); + // restore the output hidden in LoDTensor from the batch hidden + to_seq(dev_ctx, batch_hidden, hidden_out); + + batch_cell.set_lod(batched_gate->lod()); + // restore the output cell state in LoDTensor from the batch cell + to_seq(dev_ctx, batch_cell, cell_out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OP_CPU_KERNEL( + fusion_lstm, + ops::FuisonLSTMKernel, + ops::FuisonLSTMKernel); diff --git a/paddle/fluid/operators/fusion_lstm_op.h b/paddle/fluid/operators/fusion_lstm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7f79601602348ac454fc6c0cefcba0643ad8e6e2 --- /dev/null +++ b/paddle/fluid/operators/fusion_lstm_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class FusionLSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusionLSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index b4907237954ba478197d5ca8bdcbc3e1915e9dcf..7784856417e579fd43f79fa331d46df8af6c36b8 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -15,7 +15,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -61,7 +60,6 @@ class GPUGaussianRandomKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(gaussian_random, paddle::operators::GPUGaussianRandomKernel, paddle::operators::GPUGaussianRandomKernel); diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 27e26cb1b5c1e831f05dac299489628b92eaa58c..51219504ffa2a778b56351f759e8a8dfb951ad91 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -92,6 +92,7 @@ class LoadOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); framework::DeserializeFromStream(fin, selectedRows, dev_ctx); + selectedRows->SyncIndex(); } }; diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc index 2ce11e712fb1a8aa9748313ec7cf4e895a931465..de3f0990e109cacd49c4d888bbc1f797fb196e01 100644 --- a/paddle/fluid/operators/lookup_sparse_table_op.cc +++ b/paddle/fluid/operators/lookup_sparse_table_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { @@ -46,10 +45,6 @@ class LookupSparseTableOp : public framework::OperatorBase { auto out_var = scope.FindVar(Output("Out")); auto w_var = scope.FindVar(Input("W")); auto ids_var = scope.FindVar(Input("Ids")); - unsigned int seed = static_cast(Attr("seed")); - float min = Attr("min"); - float max = Attr("max"); - bool auto_grown_table = Attr("auto_grown_table"); PADDLE_ENFORCE(out_var->IsType(), "The type of Out var should be LodTensor."); @@ -60,46 +55,17 @@ class LookupSparseTableOp : public framework::OperatorBase { auto &ids_t = ids_var->Get(); auto out_t = out_var->GetMutable(); auto w_t = w_var->GetMutable(); - std::vector keys; - keys.resize(ids_t.numel()); - for (int64_t i = 0; i < ids_t.numel(); ++i) { - keys[i] = ids_t.data()[i]; - } // TODO(Yancey1989): support CUDA Place for the sparse table platform::CPUPlace cpu; auto out_shape = w_t->value().dims(); - out_shape[0] = keys.size(); + out_shape[0] = ids_t.numel(); out_t->Resize(out_shape); out_t->mutable_data(cpu, w_t->value().type()); PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()), framework::proto::VarType::FP32, "The sparse table only support FP32"); - auto non_keys_pair = w_t->Get(keys, out_t); - if (!auto_grown_table) { - PADDLE_ENFORCE_EQ(non_keys_pair.size(), static_cast(0), - "there is some keys does exists in the sparse table."); - } - auto value_shape = w_t->value().dims(); - value_shape[0] = 1; - for (const auto &it : non_keys_pair) { - const auto key = it.first; - const auto index = it.second; - framework::Tensor value; - value.Resize(value_shape); - auto data = value.mutable_data(cpu); - - std::minstd_rand engine; - engine.seed(seed); - std::uniform_real_distribution dist(min, max); - int64_t size = value.numel(); - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(engine); - } - w_t->Set(key, value); - memory::Copy(cpu, out_t->mutable_data(cpu) + index * value.numel(), - cpu, value.data(), value.numel() 
* sizeof(float)); - } + w_t->Get(ids_t, out_t, true); } }; @@ -121,21 +87,6 @@ class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker { "Otherwise the given value indicates padding the output " "with zeros whenever lookup encounters it in Ids.") .SetDefault(kNoPadding); - AddAttr("min", - "(float, default -1.0) " - "Minimum value of uniform random") - .SetDefault(-1.0f); - AddAttr("max", - "(float, default 1.0) " - "Maximum value of uniform random") - .SetDefault(1.0f); - AddAttr("seed", - "(int, default 0) " - "Random seed used for generating samples. " - "0 means use a seed generated by the system." - "Note that if seed is not 0, this operator will always " - "generate the same random numbers every time.") - .SetDefault(0); AddAttr("auto_grown_table", "(bool default false)" "Whether create new value if for nonexistent key.") diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 2558154e0b39a4281bfaa59ba75867589d73be5d..da185d93c09f9b06bd5968b9c8e93176f9ef014b 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -90,6 +90,11 @@ class Blas { void GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T* A, int lda, const T* B, int ldb, T beta, T* C, int ldc) const; + template + void GEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + T alpha, const T* A, int lda, const T* B, int ldb, T beta, T* C, + int ldc) const; + #ifdef PADDLE_WITH_MKLML template T* GEMM_ALLOC(const CBLAS_IDENTIFIER id, const int M, const int N, @@ -109,6 +114,10 @@ class Blas { void GEMM_FREE(T* data) const; #endif + template + void MatMul(const int M, const int N, const int K, const T* A, const T* B, + T* C) const; + template void MatMul(const framework::Tensor& mat_a, bool trans_a, const framework::Tensor& mat_b, bool trans_b, T alpha, @@ -134,13 +143,25 @@ class Blas { template void VADD(int n, const T* x, const T* y, T* z) const; + template + void VMUL(int n, const T* x, const T* y, T* z) const; + template void VCOPY(int n, const T* x, T* y) const; + template + void VEXP(int n, const T* x, T* y) const; + template void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta, T* C) const; + template + T DOT(int n, const T* x, const T* y) const; + + template + void SCAL(int n, const T a, T* x) const; + template void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha, const T* A, const T* B, T beta, T* C, @@ -202,16 +223,36 @@ class BlasT : private Blas { Base()->template VADD(args...); } + template + void VMUL(ARGS... args) const { + Base()->template VMUL(args...); + } + template void VCOPY(ARGS... args) const { Base()->template VCOPY(args...); } + template + void VEXP(ARGS... args) const { + Base()->template VEXP(args...); + } + template void GEMV(ARGS... args) const { Base()->template GEMV(args...); } + template + T DOT(ARGS... args) const { + return Base()->template DOT(args...); + } + + template + void SCAL(ARGS... args) const { + Base()->template SCAL(args...); + } + template void BatchedGEMM(ARGS... 
args) const { Base()->template BatchedGEMM(args...); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index bf3382107960dfd8b52f94b421b49022dcb6d291..e1df78d11e41c5f74e244643f40c6d0581fa6a4a 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -73,6 +73,16 @@ struct CBlas { platform::dynload::cblas_sgemv(args...); } + template + static float DOT(ARGS... args) { + return platform::dynload::cblas_sdot(args...); + } + + template + static void SCAL(ARGS... args) { + platform::dynload::cblas_sscal(args...); + } + template static void GEMM_BATCH(ARGS... args) { platform::dynload::cblas_sgemm_batch(args...); @@ -82,6 +92,16 @@ struct CBlas { static void VADD(ARGS... args) { platform::dynload::vsAdd(args...); } + + template + static void VMUL(ARGS... args) { + platform::dynload::vsMul(args...); + } + + template + static void VEXP(ARGS... args) { + platform::dynload::vsExp(args...); + } }; template <> @@ -133,6 +153,16 @@ struct CBlas { platform::dynload::cblas_dgemv(args...); } + template + static double DOT(ARGS... args) { + return platform::dynload::cblas_ddot(args...); + } + + template + static void SCAL(ARGS... args) { + platform::dynload::cblas_dscal(args...); + } + template static void GEMM_BATCH(ARGS... args) { platform::dynload::cblas_dgemm_batch(args...); @@ -142,6 +172,16 @@ struct CBlas { static void VADD(ARGS... args) { platform::dynload::vdAdd(args...); } + + template + static void VMUL(ARGS... args) { + platform::dynload::vdMul(args...); + } + + template + static void VEXP(ARGS... args) { + platform::dynload::vdExp(args...); + } }; #else @@ -199,6 +239,10 @@ struct CBlas { static void SMM_GEMM(...) { PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); } + static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } + static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); } + static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; + static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) 
{ PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); @@ -206,64 +250,6 @@ struct CBlas { #endif }; -template -inline bool UseXSMM(const int &m, const int &n, const int &k, bool transa, - bool transb, const T &alpha, const T &beta) { -#ifdef PADDLE_WITH_LIBXSMM - // Refer to https://github.com/hfp/libxsmm/blob/master/README.md - // But the threshold is custom - constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; - if (m * n * k > LIBXSMM_THRESHOLD || transa || transb || - std::abs(alpha - static_cast(1) > - std::numeric_limits::epsilon()) || - std::abs(beta) > std::numeric_limits::epsilon()) { - return false; - } else { - return true; - } -#endif - return false; -} - -template <> -inline bool UseXSMM(const int &m, const int &n, const int &k, - bool transa, bool transb, - const platform::float16 &alpha, - const platform::float16 &beta) { - return false; -} - -template -inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha, - const T *A, int lda, const T *B, int ldb, T beta, T *C, - int ldc) { -#ifdef PADDLE_WITH_LIBXSMM - if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha, - beta)) { - // Note: SMM use ColMajor - const char transa = 'N'; - const char transb = 'N'; - CBlas::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda, - &beta, C, &ldc); - return; - } -#endif - -#ifdef PADDLE_MKL_SPLIT_GEMM - constexpr int bs = 2; - if (M % bs == 0 && transA == CblasNoTrans && transB == CblasNoTrans) { - for (int off = 0; off < M; off += bs) { - CBlas::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, bs, N, K, alpha, - A + off * lda, lda, B, ldb, beta, C + off * ldb, ldc); - } - return; - } -#endif - CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); -} - #ifdef PADDLE_WITH_MKLML template <> template @@ -308,8 +294,8 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; - GEMM_WARP(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); + CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); } template <> @@ -318,9 +304,20 @@ void Blas::GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T *A, int lda, const T *B, int ldb, T beta, T *C, int ldc) const { - GEMM_WARP(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, - transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, - lda, B, ldb, beta, C, ldc); + CBlas::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, + transB == false ? 
CblasNoTrans : CblasTrans, M, N, K, alpha, A, + lda, B, ldb, beta, C, ldc); +} + +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, + int N, int K, T alpha, const T *A, + int lda, const T *B, int ldb, + T beta, T *C, int ldc) const { + CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); } template @@ -374,6 +371,61 @@ void Blas::VADD(int n, const T *x, const T *y, #endif } +template <> +template +void Blas::VMUL(int n, const T *x, const T *y, + T *z) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VMUL(n, x, y, z); +#else + // try to find if openblas support vmul + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +#endif +} + +template <> +template +void Blas::VEXP(int n, const T *x, T *y) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VEXP(n, x, y); +#else + // try to find if openblas support vexp + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +#endif +} + +template <> +template +T Blas::DOT(int n, const T *x, const T *y) const { +#ifdef PADDLE_WITH_MKLML + return CBlas::DOT(n, x, 1, y, 1); +#else + // try to find if openblas support cblas_dot + T sum = 0; + for (int i = 0; i < n; ++i) { + sum += x[i] * y[i]; + } + return sum; +#endif +} + +template <> +template +void Blas::SCAL(int n, const T a, T *x) const { +#ifdef PADDLE_WITH_MKLML + CBlas::SCAL(n, a, x, 1); +#else + // try to find if openblas support cblas_scal + for (int i = 0; i < n; ++i) { + x[i] = a * x[i]; + } +#endif +} + template <> template void Blas::GEMV(bool trans_a, int M, int N, T alpha, @@ -415,6 +467,42 @@ void Blas::BatchedGEMM( #endif } +template +template +void Blas::MatMul(const int M, const int N, const int K, + const T *A, const T *B, T *C) const { + this->template GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, + static_cast(1), A, K, B, N, static_cast(0), C, + N); +} + +template <> +template +void Blas::MatMul(const int M, const int N, + const int K, const T *A, + const T *B, T *C) const { +#ifdef PADDLE_WITH_LIBXSMM + // Refer to https://github.com/hfp/libxsmm/blob/master/README.md + // But the threshold is custom constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; + + // Since the matrix is very small, + // so the unit of calculation is already very fast, + // and the if( M*N*K < LIBXSMM_THRESHOLD) would be overhead, + // use xsmm directly. 
+ // Note: SMM use ColMajor + const char transa = 'N'; + const char transb = 'N'; + const T alpha = static_cast(1); + const T beta = static_cast(0); + CBlas::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta, + C, &N); + return; +#endif + + CBlas::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, + static_cast(1), A, K, B, N, static_cast(0), C, N); +} + template template void Blas::MatMul(const framework::Tensor &mat_a, diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc index 55c8a472aca7fe700ef6a3f96bed1496d7b12b80..fbe7c2978385401b35765101c87387ff727be4e0 100644 --- a/paddle/fluid/operators/math/concat.cc +++ b/paddle/fluid/operators/math/concat.cc @@ -71,7 +71,7 @@ class ConcatGradFunctor { public: void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, - const std::vector& ref_inputs, + const std::vector& ref_inputs, const int axis, std::vector* outputs) { // TODO(zcd): Add input data validity checking size_t num = outputs->size(); diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu index 5863d74fca21de8b77bc208fb95d8fd52562f7a7..820e73e779720e4f76168e0a84a254ef645784ee 100644 --- a/paddle/fluid/operators/math/concat.cu +++ b/paddle/fluid/operators/math/concat.cu @@ -189,7 +189,7 @@ class ConcatGradFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, - const std::vector& ref_inputs, + const std::vector& ref_inputs, const int axis, std::vector* outputs) { // TODO(zcd): Add input data validity checking int o_num = outputs->size(); diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat.h index 9e080f2e8be23768dcea47b577043beef37b2eaf..e5d7d860b371677b3cfc67a57390bdee0d0ecc37 100644 --- a/paddle/fluid/operators/math/concat.h +++ b/paddle/fluid/operators/math/concat.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/lod_tensor.h" namespace paddle { namespace operators { @@ -57,7 +57,7 @@ template class ConcatGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ref_inputs, + const std::vector& ref_inputs, const int axis, std::vector* outputs); }; diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h new file mode 100644 index 0000000000000000000000000000000000000000..48c0da0e368a0fe6efcd758536e5659eeee26f7e --- /dev/null +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace math { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + +template +inline T sigmoid(T x) { + return 1. / (1. + exp(-x)); +} + +template +inline T tanh(T x) { + return 2. * sigmoid(2. * x) - 1.; +} + +template +inline void vec_identity(const int n, const T* x, T* y) { + // do nothing + return; +} + +template +inline void vec_sigmoid(const int n, const T* x, T* y) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = 1.0 / (1.0 + std::exp(-tmp)); + } +} + +template +inline void vec_tanh(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = tanh(x[i]); + } +} + +template +inline void vec_relu(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + +template <> +inline void vec_relu(const int n, const float* x, + float* y) { + // TODO(TJ): complete me + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + +template <> +inline void vec_relu(const int n, const float* x, + float* y) { + // TODO(TJ): complete me + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + +template +class VecActivations { + public: + std::function operator()( + const std::string& type) { + if (type == "sigmoid") { + return vec_sigmoid; + } else if (type == "relu") { + return vec_relu; + } else if (type == "tanh") { + return vec_tanh; + } else if (type == "identity" || type == "") { + return vec_identity; + } + PADDLE_THROW("Not support type %s.", type); + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 58b85abf822741905a4e9547823b6cdbe645d39a..0de58d5fddd84d33f708c4c73e5a19dc2fe8a86b 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -15,25 +15,11 @@ limitations under the License. */ #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { namespace math { -template -HOSTDEVICE T log(const T& val) { - return std::log(val); -} - -template <> -HOSTDEVICE platform::float16 log(const platform::float16& val) { - // strage bug, hlog is not exists. 
- return static_cast(0); - // half tmp = static_cast(val); - // return static_cast(hlog(tmp)); -} - namespace { template __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, @@ -49,12 +35,12 @@ template __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, const int class_num) { int tid = threadIdx.x; - T val(0); + T val = 0; int idx = blockIdx.x * class_num + tid; int end = blockIdx.x * class_num + class_num; for (; idx < end; idx += blockDim.x) { - val += math::TolerableValue()(log(X[idx])) * label[idx]; + val += math::TolerableValue()(std::log(X[idx])) * label[idx]; } val = paddle::platform::reduceSum(val, tid, blockDim.x); @@ -98,8 +84,6 @@ class CrossEntropyFunctor { template class CrossEntropyFunctor; template class CrossEntropyFunctor; -template class CrossEntropyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h index 2e4e4781c2eee1d9a0fc6760093a424ab3d5eb9d..adc5b3fe47cd3bf524eb56747b6bd51e345a2eb6 100644 --- a/paddle/fluid/operators/math/cross_entropy.h +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -13,10 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { @@ -35,21 +33,6 @@ struct TolerableValue { } }; -// float16 value clip behave different. -using paddle::platform::float16; -using paddle::platform::isfinite; -template <> -struct TolerableValue { - HOSTDEVICE float16 operator()(const float16& x) const { - if (isfinite(x)) - return x; - else if (x > static_cast(0)) - return std::numeric_limits::max(); - else - return std::numeric_limits::min(); - } -}; - template class CrossEntropyFunctor { public: diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..1f5a49c0ab5a10b0d7dc1febd258ce76c467cb1c --- /dev/null +++ b/paddle/fluid/operators/math/fc_compute.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/math/blas.h" + +DECLARE_int32(paddle_num_threads); + +namespace paddle { +namespace operators { +namespace math { + +template +inline void FCCompute(const BlasT& blas, const int M, + const int N, const int K, const T* X, const T* W, T* Y, + const T* B = NULL, bool relu = false) { + blas.MatMul(M, N, K, X, W, Y); + if (B == NULL) { + return; + } + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for if (FLAGS_paddle_num_threads > 1) +#endif + for (int i = 0; i < M; i++) { + blas.AXPY(N, static_cast(1), B, Y + i * N); + } + + if (!relu) { + return; + } + + // TODO(TJ): fuse relu + LOG(FATAL) << "Not implemented!"; +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 00dbfc11a239da70ec81e3498d2f4d5e5bf1c63f..a92762c7fea865fad2c7784736cce93a8af21892 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -77,7 +76,6 @@ struct SelectedRowsAdd { template struct SelectedRowsAdd; template struct SelectedRowsAdd; -template struct SelectedRowsAdd; namespace { template @@ -122,7 +120,7 @@ struct SelectedRowsAddTensor { auto* out_data = output->data(); SetConstant functor; - functor(context, output, static_cast(0)); + functor(context, output, 0.0); const int block_size = 256; dim3 threads(block_size, 1); @@ -140,8 +138,6 @@ struct SelectedRowsAddTensor { template struct SelectedRowsAddTensor; template struct SelectedRowsAddTensor; -template struct SelectedRowsAddTensor; template struct SelectedRowsAddTo { @@ -181,8 +177,6 @@ template struct SelectedRowsAddTo; template struct SelectedRowsAddTo; template struct SelectedRowsAddTo; template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; namespace { template @@ -235,8 +229,6 @@ template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; namespace scatter { @@ -284,7 +276,7 @@ struct MergeAdd { context.GetPlace()); math::SetConstant constant_functor; - constant_functor(context, out.mutable_value(), static_cast(0)); + constant_functor(context, out.mutable_value(), 0.0); auto* out_data = out.mutable_value()->data(); auto* input_data = input.value().data(); @@ -308,7 +300,6 @@ template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; -template struct MergeAdd; template __global__ void UpdateToTensorKernel(const T* selected_rows, diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 785c4baecbf056d08930f4bb704aec067a2db4a2..3effe776258cb541dbba32f63eda457d917011f4 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -94,15 +94,12 @@ void SoftmaxGradCUDNNFunctor::operator()( template class SoftmaxCUDNNFunctor; template class SoftmaxCUDNNFunctor; template class SoftmaxCUDNNFunctor; -template class SoftmaxGradCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; 
template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; -template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index 07aa23754f9786c56c0be14c2a71d5290d2cccf7..91e0ab28efc21d4376524c8ecf66b429d51d8847 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -12,16 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#define EIGEN_USE_GPU + #include "paddle/fluid/operators/mean_op.h" -#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; -namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( mean, ops::MeanKernel, - ops::MeanKernel, - ops::MeanKernel); + ops::MeanKernel); REGISTER_OP_CUDA_KERNEL( mean_grad, ops::MeanGradKernel, - ops::MeanGradKernel, - ops::MeanGradKernel); + ops::MeanGradKernel); diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h index a41d50ae0b99797800078184f7ffeb366367f493..362e9f9ae8b2f0f77198e3f3939211ae1117b27b 100644 --- a/paddle/fluid/operators/mean_op.h +++ b/paddle/fluid/operators/mean_op.h @@ -55,7 +55,7 @@ class MeanGradKernel : public framework::OpKernel { IG->mutable_data(context.GetPlace()); T ig_size = static_cast(IG->numel()); - Eigen::DSizes bcast(static_cast(ig_size)); + Eigen::DSizes bcast(ig_size); EigenVector::Flatten(*IG).device( *context.template device_context().eigen_device()) = diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 51993398bd3427e1f0da155918395bc50fa65e45..2a8e4af516ce9341772d4668dc993215b4aae24d 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -54,9 +54,9 @@ class MulOp : public framework::OperatorWithKernel { auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims); - PADDLE_ENFORCE_EQ( - x_mat_dims[1], y_mat_dims[0], - "First matrix's width must be equal with second matrix's height."); + PADDLE_ENFORCE_EQ(x_mat_dims[1], y_mat_dims[0], + "First matrix's width must be equal with second matrix's " + "height. 
%s, %s"); std::vector output_dims; output_dims.reserve( static_cast(x_num_col_dims + y_dims.size() - y_num_col_dims)); diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc index 6c5a83c6a50c463502171f09bbf18e17e43917b5..81f3e42bf412fa4d2cb48405f2f8ee49b6aa0b67 100644 --- a/paddle/fluid/operators/mul_op.cu.cc +++ b/paddle/fluid/operators/mul_op.cu.cc @@ -20,7 +20,6 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel, ops::MulKernel, ops::MulKernel); -REGISTER_OP_CUDA_KERNEL( - mul_grad, ops::MulGradKernel, - ops::MulGradKernel, - ops::MulGradKernel); +REGISTER_OP_CUDA_KERNEL(mul_grad, + ops::MulGradKernel, + ops::MulGradKernel); diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h index 15dd975e3bbf80b2e616e6628555e812d025f70a..f72824806ed6ee3a4490938403d441326f8a3d4a 100644 --- a/paddle/fluid/operators/mul_op.h +++ b/paddle/fluid/operators/mul_op.h @@ -62,23 +62,31 @@ class MulGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { int x_num_col_dims = ctx.template Attr("x_num_col_dims"); int y_num_col_dims = ctx.template Attr("y_num_col_dims"); - const Tensor* x = ctx.Input("X"); - const Tensor* y = ctx.Input("Y"); - const Tensor x_matrix = x->dims().size() > 2 - ? framework::ReshapeToMatrix(*x, x_num_col_dims) - : *x; - const Tensor y_matrix = y->dims().size() > 2 - ? framework::ReshapeToMatrix(*y, y_num_col_dims) - : *y; - const Tensor* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : static_cast(*x); + auto y_matrix = y->dims().size() > 2 + ? framework::ReshapeToMatrix(*y, y_num_col_dims) + : static_cast(*y); + auto* dout = ctx.Input(framework::GradVarName("Out")); Tensor dout_mat; dout_mat.ShareDataWith(*dout); dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0], framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]}); - Tensor* dx = ctx.Output(framework::GradVarName("X")); - Tensor* dy = ctx.Output(framework::GradVarName("Y")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + + if (dx != nullptr) { + dx->set_lod(x->lod()); + } + if (dy != nullptr) { + dy->set_lod(y->lod()); + } + auto& dev_ctx = ctx.template device_context(); auto blas = math::GetBlas(dev_ctx); if (dx) { diff --git a/paddle/fluid/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt index ce0ddd89bfb0d73e237a6f9a777376624d8ef2d4..cdcba8035762d8f442eb8b8ed52a4e3e99ac31b6 100644 --- a/paddle/fluid/operators/nccl/CMakeLists.txt +++ b/paddle/fluid/operators/nccl/CMakeLists.txt @@ -1,3 +1,3 @@ -if(WITH_GPU) +if(WITH_GPU AND NOT WIN32) nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator ) endif() diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index 9fdbee818a217842e47c8ab11b84c6d5513ad219..31f083565fddee66aea1485ed71f41b6199f4502 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -174,8 +174,7 @@ REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, ops::PoolCUDNNOpKernel); REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); + ops::PoolCUDNNGradOpKernel); REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, 
ops::PoolCUDNNOpKernel, @@ -183,5 +182,4 @@ REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, ops::PoolCUDNNOpKernel); REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); + ops::PoolCUDNNGradOpKernel); diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 4a6ce938a5f337d035b21f562d46daf606236db0..a1f368e8690512cec2db7593aabc0279bbe174eb 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -57,6 +57,8 @@ class RecvOp : public framework::OperatorBase { class RecvOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { + AddInput("X", "(Any) Dummy inputs, used for control dependency") + .AsDuplicable(); AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable(); AddComment(R"DOC( Recv operator diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h index f730a9746da56ca82090122193ec54efb774483e..01308e416a9313bad13ded4e40c79bb0550e31ed 100644 --- a/paddle/fluid/operators/sampling_id_op.h +++ b/paddle/fluid/operators/sampling_id_op.h @@ -54,7 +54,7 @@ class SamplingIdKernel : public framework::OpKernel { static_cast(context.Attr("max"))); std::vector ids(batch_size); - for (size_t i = 0; i < batch_size; ++i) { + for (int i = 0; i < batch_size; ++i) { T r = dist(engine); int idx = width - 1; for (int j = 0; j < width; ++j) { @@ -63,7 +63,7 @@ class SamplingIdKernel : public framework::OpKernel { break; } } - ids[i] = ins_vector[i * width + idx]; + ids[i] = ins_vector[idx]; } std::vector out_dim; diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index 201a51130d6b6f94104e2dabf9e7facffa672ae0..85de37416b5f24128ee98320a872eafffe967c81 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -142,6 +142,8 @@ class SaveOp : public framework::OperatorBase { std::string filename = lt_var->data(); VLOG(4) << "SaveSelectedRows get File name: " << filename; + MkDirRecursively(DirName(filename).c_str()); + auto &selectedRows = var->Get(); // get device context from pool diff --git a/paddle/fluid/operators/scale_op.cu b/paddle/fluid/operators/scale_op.cu index d266867046334f95eaaf4b7a9acb3fec20f1e439..04c802da12958a53626f533833c2709110531136 100644 --- a/paddle/fluid/operators/scale_op.cu +++ b/paddle/fluid/operators/scale_op.cu @@ -13,15 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/scale_op.h" -#include "paddle/fluid/platform/float16.h" -namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( scale, paddle::operators::ScaleKernel, paddle::operators::ScaleKernel, paddle::operators::ScaleKernel, paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel); + int64_t>); diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index bf5e0d864495ce3a651a31c9d5a7664fe9eb2396..c32d2603cf76f55a9e723196977b0a70c92d597a 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -81,8 +81,8 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "The source input of scatter op"); AddInput("Ids", "The index input of scatter op where X will be updated"); - AddInput("Updates", "The updated value of updates op"); - AddOutput("Out", "The output of add op"); + AddInput("Updates", "The updated value of scatter op"); + AddOutput("Out", "The output of scatter op"); AddComment(R"DOC( Scatter Operator. @@ -90,7 +90,7 @@ This operator obtains output by updating the input on selected indices on the fi $$ Out = X \\ -Out[Ids] = X[Ids] + Updates +Out[Ids] = Updates $$ )DOC"); diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h index 181bb1af5cce7fad228e61e1a76ed66a9bd61b3e..2eefbba9726af4d38b40d91e9242faa2923dca20 100644 --- a/paddle/fluid/operators/scatter_op.h +++ b/paddle/fluid/operators/scatter_op.h @@ -34,9 +34,9 @@ class ScatterOpKernel : public framework::OpKernel { auto *Updates = ctx.Input("Updates"); auto *Out = ctx.Output("Out"); - // In place output: Out = X, Out[Ids] += Updates + // In place output: Out = X, Out[Ids] = Updates framework::TensorCopySync(*X, ctx.GetPlace(), Out); - // Apply ScatterUpdate: Out[index] += Updates[:] + // Apply ScatterUpdate: Out[index] = Updates[:] ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); } }; @@ -55,7 +55,7 @@ class ScatterGradientOpKernel : public framework::OpKernel { // In place gradient: dX = dO framework::TensorCopySync(*dOut, ctx.GetPlace(), dX); dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates += dO[Ids] + // Gradient by Gather: dUpdates = dO[Ids] CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); } }; diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index 1866a86048acbefadcb4d82cd6309cd16f0352d6..14b07649c416ff1b671fc9b5ee4eb956b44570c5 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -37,22 +37,19 @@ class SendBarrierOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { std::vector eps = Attr>("endpoints"); - bool sync_mode = Attr("sync_mode"); distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); - VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode; + VLOG(3) << "SendBarrierOp sync"; // need to wait before sending send_barrier message PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); - if (sync_mode) { - for (auto& ep : eps) { - VLOG(3) << "send barrier, ep: " << ep; - rpc_client->AsyncSendBatchBarrier(ep); - } - PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); + for (auto& ep : eps) { + VLOG(3) << "send barrier, ep: " << ep; + rpc_client->AsyncSendBatchBarrier(ep); } + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } }; @@ -70,7 +67,6 @@ the 
Parameter Server would knew all variables have been sent. "(string vector, default 127.0.0.1:6164)" "Server endpoints to send variables to.") .SetDefault({"127.0.0.1:6164"}); - AddAttr("sync_mode", "work in sync_mode or not").SetDefault(true); } }; diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 3cd42f2d059532b7090e66ce21de8e5cb014adf1..82a70e4bf13247d784371ffdf419c9f792d7f721 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -66,6 +66,8 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker { void Make() { AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") .AsDuplicable(); + AddOutput("Out", "(Any) Dummy outputs, used for control dependency") + .AsDuplicable(); AddComment(R"DOC( Send operator diff --git a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc index 0ddacb57106c090e8f4f9350a65a30ca102f8e0a..7aca9f7111956dba63e2ceee10077879fe092bdf 100644 --- a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc @@ -68,7 +68,9 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel { auto* out_grad = ctx.Input(framework::GradVarName("Out")); auto* x = ctx.Input("X"); auto* x_grad = ctx.Output(framework::GradVarName("X")); - + if (x_grad) { + x_grad->set_lod(x->lod()); + } auto lod = x->lod(); const size_t level = lod.size() - 1; diff --git a/paddle/fluid/operators/sequence_softmax_op.h b/paddle/fluid/operators/sequence_softmax_op.h index cb93a02b8386ed50ff176fc25b88449b7eb16902..bca564e16f9951519eefe25126aadebb4c1326b6 100644 --- a/paddle/fluid/operators/sequence_softmax_op.h +++ b/paddle/fluid/operators/sequence_softmax_op.h @@ -66,6 +66,9 @@ class SequenceSoftmaxGradKernel : public framework::OpKernel { auto* out_grad = ctx.Input(framework::GradVarName("Out")); auto* x = ctx.Input("X"); auto* x_grad = ctx.Output(framework::GradVarName("X")); + if (x_grad) { + x_grad->set_lod(x->lod()); + } auto lod = x->lod(); const size_t level = lod.size() - 1; diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h index 2685ce217ee0f0d3e89f3751e96218dcd19bead4..d8b0165b2a89b04bd55671a37d96ee4ba275b2eb 100644 --- a/paddle/fluid/operators/sgd_op.h +++ b/paddle/fluid/operators/sgd_op.h @@ -111,7 +111,7 @@ class SGDOpKernel : public framework::OpKernel { for (size_t i = 0; i < grad.rows().size(); i++) { PADDLE_ENFORCE(grad.rows()[i] < grad.height(), "Input rows index should less than height"); - int64_t id_index = param.Index(grad.rows()[i]); + int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false); PADDLE_ENFORCE_GE(id_index, static_cast(0), "id should be in the table"); for (int64_t j = 0; j < grad_row_width; j++) { diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc index c2d45c3d2ef82683352afe0e72f0330f7cd753f6..2bdb23e999621b10799b5163f326bc4b66a437e6 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc @@ -78,5 +78,4 @@ REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace, ops::SoftmaxCUDNNKernel, ops::SoftmaxCUDNNKernel); REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace, - ops::SoftmaxGradCUDNNKernel, - ops::SoftmaxGradCUDNNKernel); + ops::SoftmaxGradCUDNNKernel); diff --git a/paddle/fluid/operators/softmax_op.cu.cc b/paddle/fluid/operators/softmax_op.cu.cc index 
19359b7eef5126d84f0707d39095a74ae4561186..5fb4f011d9b47cebc4a23bcce47eada825263343 100644 --- a/paddle/fluid/operators/softmax_op.cu.cc +++ b/paddle/fluid/operators/softmax_op.cu.cc @@ -23,5 +23,4 @@ REGISTER_OP_CUDA_KERNEL( ops::SoftmaxKernel); REGISTER_OP_CUDA_KERNEL( softmax_grad, ops::SoftmaxGradKernel, - ops::SoftmaxGradKernel, - ops::SoftmaxGradKernel); + ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index 6c507baf3a0ab0a557d29a53700685753616193b..8a683116b8054de12fc4419b5aa5fbc019b675bb 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -23,9 +23,9 @@ class SqueezeOpInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of SqueezeOp should not be null."); + "Input(X) of Squeeze operator should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of SqueezeOp should not be null."); + "Output(Out) of Squeeze operator should not be null."); const auto &x_dims = ctx->GetInputDim("X"); // Check input tensor dims (<6) Eigen limit. @@ -107,7 +107,6 @@ class SqueezeOp : public framework::OperatorBase { framework::AttributeMap attrs; attrs["shape"] = framework::vectorize2int(out_dims); - attrs["inplace"] = Attr("inplace"); // Invoke Reshape Op auto reshape_op = framework::OpRegistry::CreateOp( "reshape", {{"X", {Input("X")}}, {"Shape", {}}}, @@ -125,12 +124,6 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker { "(std::vector). List of integers," " indicating the dimensions to squeeze.") .SetDefault({}); - AddAttr("inplace", - "(default: false) Squeeze the source tensor's shape without " - "memory copy. When Attr(inplace) is set true, the output " - "tensor shares memory with Input(X), otherwise, a new output " - "tensor is created, and its data are copied from Input(x).") - .SetDefault(false); AddComment(R"DOC( Squeeze Operator. @@ -180,7 +173,6 @@ class SqueezeGradOp : public framework::OperatorBase { auto x_dims = scope.FindVar(Input("X"))->Get().dims(); framework::AttributeMap attrs; attrs["shape"] = framework::vectorize2int(x_dims); - attrs["inplace"] = Attr("inplace"); auto reshape_op = framework::OpRegistry::CreateOp( "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}}, diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3f4b48bc7391def082c82ed451fc5a752009a2f1 --- /dev/null +++ b/paddle/fluid/operators/stack_op.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
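// NOTE (illustration only, not part of the original patch): the stack op
// registered below packs N identically-shaped inputs into one output along
// Attr(axis). For example, stacking three [2, 3] tensors with axis = 0 yields
// a [3, 2, 3] output, while axis = 1 yields [2, 3, 3].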
+ +#include "paddle/fluid/operators/stack_op.h" + +namespace plat = paddle::platform; +namespace ops = paddle::operators; +REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker, + ops::StackGradOpDescMaker); +REGISTER_OPERATOR(stack_grad, ops::StackOpGrad); + +REGISTER_OP_CPU_KERNEL(stack, ops::StackKernel, + ops::StackKernel); + +REGISTER_OP_CPU_KERNEL(stack_grad, + ops::StackGradKernel, + ops::StackGradKernel); diff --git a/paddle/fluid/operators/fill_constant_op.cu.cc b/paddle/fluid/operators/stack_op.cu similarity index 53% rename from paddle/fluid/operators/fill_constant_op.cu.cc rename to paddle/fluid/operators/stack_op.cu index 51ccaefa4338dfa18d26441a59d5fed2b9fa0c39..92c1bde2bcf089e5c715e90e564408e6ad37ba17 100644 --- a/paddle/fluid/operators/fill_constant_op.cu.cc +++ b/paddle/fluid/operators/stack_op.cu @@ -12,15 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/fill_constant_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/operators/stack_op.h" +namespace plat = paddle::platform; namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - fill_constant, - ops::FillConstantOpKernel, - ops::FillConstantOpKernel, - ops::FillConstantOpKernel, - ops::FillConstantOpKernel, - ops::FillConstantOpKernel) + +REGISTER_OP_CUDA_KERNEL(stack, ops::StackKernel, + ops::StackKernel); + +REGISTER_OP_CUDA_KERNEL(stack_grad, + ops::StackGradKernel, + ops::StackGradKernel); diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c777d5feaec1c3a6216b01359a250072a674b700 --- /dev/null +++ b/paddle/fluid/operators/stack_op.h @@ -0,0 +1,278 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
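// Illustration (hypothetical standalone sketch, not part of this header): the
// Stack kernels below flatten the copy into pre * n * post elements, where
// `pre` is the product of the dims before Attr(axis), `n` is the number of
// inputs and `post` is the product of the remaining dims. The same index math
// on plain std::vectors:

#include <cstdio>
#include <vector>

int main() {
  // Two 2x2 inputs stacked along axis = 1 -> output shape [2, 2, 2].
  std::vector<std::vector<float>> x = {{0, 1, 2, 3}, {10, 11, 12, 13}};
  int pre = 2, n = 2, post = 2;
  std::vector<float> y(pre * n * post);
  for (int idx = 0; idx < pre * n * post; ++idx) {
    int i = idx / (n * post);             // position along the `pre` dims
    int which_x = idx / post - i * n;     // which input supplies this element
    int x_index = i * post + idx % post;  // flat offset inside that input
    y[idx] = x[which_x][x_index];
  }
  for (float v : y) std::printf("%g ", v);  // prints: 0 1 10 11 2 3 12 13
  return 0;
}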
+ +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/for_range.h" + +#ifdef __NVCC__ +#include +#include "paddle/fluid/framework/array.h" +#endif + +namespace paddle { +namespace operators { + +class StackOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_GT(ctx->Inputs("X").size(), 0, + "Number of Inputs(X) must be larger than 0"); + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist."); + + auto input_dims = ctx->GetInputsDim("X"); + for (size_t i = 1; i < input_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0], + "Dims of all Inputs(X) must be the same"); + } + + // Only lod of X[0] would be shared with Y + ctx->ShareLoD("X", /*->*/ "Y"); + + int axis = ctx->Attrs().Get("axis"); + int rank = input_dims[0].size(); + PADDLE_ENFORCE( + axis >= -(rank + 1) && axis < rank + 1, + "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank); + if (axis < 0) axis += (rank + 1); + + auto vec = framework::vectorize2int(input_dims[0]); + vec.insert(vec.begin() + axis, input_dims.size()); + ctx->SetOutputDim("Y", framework::make_ddim(vec)); + } +}; + +class StackOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input of stack op.").AsDuplicable(); + AddOutput("Y", "The output of stack op."); + AddAttr("axis", + "The axis along which all of the Inputs(X) should be stacked.") + .SetDefault(0); + AddComment(R"DOC( + Stack Operator. + + Stack all of the Inputs(X) into one tensor along Attr(axis). The dims of all Inputs(X) must be the same. + )DOC"); + } +}; + +template +struct StackFunctor { + HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post) + : x_(x), y_(y), n_(n), post_(post) {} + + HOSTDEVICE void operator()(int idx) { + int i = idx / (n_ * post_); + int which_x = idx / post_ - i * n_; + int x_index = i * post_ + idx % post_; + y_[idx] = x_[which_x][x_index]; + } + + private: + VecXType x_; + T *y_; + int n_; + int post_; +}; + +template +struct StackGradFunctor { + HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) + : dx_(dx), dy_(dy), n_(n), post_(post) {} + + HOSTDEVICE void operator()(int idx) { + int i = idx / (n_ * post_); + int which_x = idx / post_ - i * n_; + int x_index = i * post_ + idx % post_; + dx_[which_x][x_index] = dy_[idx]; + } + + private: + VecDxType dx_; + const T *dy_; + int n_; + int post_; +}; + +template +static inline void StackFunctorForRange(const DeviceContext &ctx, + const VecXType &x, T *y, int total_num, + int n, int post) { + platform::ForRange for_range(ctx, total_num); + for_range(StackFunctor(x, y, n, post)); +} + +template +static inline void StackGradFunctorForRange(const DeviceContext &ctx, + const VecDxType &dx, const T *dy, + int total_num, int n, int post) { + platform::ForRange for_range(ctx, total_num); + for_range(StackGradFunctor(dx, dy, n, post)); +} + +template +class StackKernel : public framework::OpKernel { + using Tensor = framework::LoDTensor; + + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto x = ctx.MultiInput("X"); + auto *y = ctx.Output("Y"); + + int axis = ctx.Attr("axis"); + if (axis < 0) axis += (x[0]->dims().size() + 1); + + int n = static_cast(x.size()); + auto *y_data = y->mutable_data(ctx.GetPlace()); + std::vector x_datas(n); + for (int i = 0; i < n; i++) 
x_datas[i] = x[i]->data(); + + int pre = 1, post = 1; + auto &dim = x[0]->dims(); + for (auto i = 0; i < axis; ++i) pre *= dim[i]; + for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; + int total_num = pre * n * post; + + auto &dev_ctx = ctx.template device_context(); + constexpr auto kMaxThreshold = 16; + if (std::is_same::value || + n > kMaxThreshold) { +#ifdef __NVCC__ + VLOG(10) << "Stack more than " << kMaxThreshold + << " tensors on GPU may be slow."; + thrust::device_vector device_x_vec(x_datas); + auto x_data_arr = device_x_vec.data().get(); +#else + auto x_data_arr = x_datas.data(); +#endif + StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); +#ifdef __NVCC__ + // Wait() must be called because device_x_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#endif + } +#ifdef __NVCC__ + else { // NOLINT + framework::Array x_data_arr; + for (int i = 0; i < n; ++i) x_data_arr[i] = x_datas[i]; + StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); + } +#endif + } +}; + +class StackOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@Grad) must exist."); + + int axis = ctx->Attrs().Get("axis"); + auto dy_dim = ctx->GetInputDim(framework::GradVarName("Y")); + int rank = dy_dim.size(); + PADDLE_ENFORCE(axis >= -rank && axis < rank, + "Attr(axis) must be inside [-rank, rank), where rank = %d", + rank); + if (axis < 0) axis += rank; + + PADDLE_ENFORCE_EQ(ctx->Outputs(framework::GradVarName("X")).size(), + static_cast(dy_dim[axis]), + "Number of Outputs(X@Grad) is wrong"); + auto vec = framework::vectorize2int(dy_dim); + vec.erase(vec.begin() + axis); + ctx->SetOutputsDim( + framework::GradVarName("X"), + std::vector(dy_dim[axis], framework::make_ddim(vec))); + } +}; + +class StackGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("stack_grad"); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + op->SetAttrMap(Attrs()); + return op; + } +}; + +template +class StackGradKernel : public framework::OpKernel { + using Tensor = framework::LoDTensor; + + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *dy = ctx.Input(framework::GradVarName("Y")); + auto dx = ctx.MultiOutput(framework::GradVarName("X")); + int axis = ctx.Attr("axis"); + if (axis < 0) axis += dy->dims().size(); + + int n = dy->dims()[axis]; + std::vector dx_datas(n); // NOLINT + for (int i = 0; i < n; i++) { + dx_datas[i] = dx[i]->mutable_data(ctx.GetPlace()); + } + auto dy_data = dy->data(); + + int pre = 1; + for (int i = 0; i < axis; ++i) pre *= dy->dims()[i]; + int total_num = dy->numel(); + int post = total_num / (n * pre); + + auto &dev_ctx = ctx.template device_context(); + constexpr auto kMaxThreshold = 16; + if (std::is_same::value || + n > kMaxThreshold) { +#ifdef __NVCC__ + VLOG(10) << "Stack more than " << kMaxThreshold + << " tensors on GPU may be slow."; + thrust::device_vector device_dx_vec(dx_datas); + auto dx_data_arr = device_dx_vec.data().get(); +#else + auto dx_data_arr = dx_datas.data(); +#endif + StackGradFunctorForRange(dev_ctx, 
dx_data_arr, dy_data, total_num, n, + post); +#ifdef __NVCC__ + // Wait() must be called because device_dx_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#endif + } +#ifdef __NVCC__ + else { // NOLINT + framework::Array dx_data_arr; + for (int i = 0; i < n; ++i) dx_data_arr[i] = dx_datas[i]; + StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, + post); + } +#endif + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc index d2035777ee2289291a02594ee289156504df09d9..f9a16ef35ecb9eeb6c8eda9d124ecb17e7f9d5ce 100644 --- a/paddle/fluid/operators/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/sum_mkldnn_op.cc @@ -34,15 +34,15 @@ namespace paddle { namespace operators { -using paddle::framework::Tensor; -using paddle::platform::MKLDNNDeviceContext; -using paddle::platform::CPUDeviceContext; using framework::DataLayout; using mkldnn::memory; using mkldnn::primitive; +using mkldnn::reorder; using mkldnn::stream; using mkldnn::sum; -using mkldnn::reorder; +using paddle::framework::Tensor; +using paddle::platform::CPUDeviceContext; +using paddle::platform::MKLDNNDeviceContext; using platform::to_void_cast; template @@ -175,18 +175,35 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { auto& sel_row = get_selected_row(i); first_dim += sel_row.rows().size(); } - auto in_dim = - framework::vectorize(get_selected_row(N - 1).value().dims()); + + std::vector in_dim; + for (int i = 0; i < N; i++) { + auto& sel_row = get_selected_row(i); + if (sel_row.rows().size() > 0) { + in_dim = framework::vectorize(sel_row.value().dims()); + break; + } + } + + if (in_dim.empty()) { + VLOG(3) << "WARNING: all the inputs are empty"; + in_dim = framework::vectorize(get_selected_row(N - 1).value().dims()); + } else { + in_dim[0] = static_cast(first_dim); + } + in_dim[0] = static_cast(first_dim); out_value->Resize(framework::make_ddim(in_dim)); + out_value->mutable_data(ctx.GetPlace()); + // if all the input sparse vars are empty, no need to // merge these vars. if (first_dim == 0UL) { return; } - out_value->mutable_data(ctx.GetPlace()); + math::SelectedRowsAddTo functor; int64_t offset = 0; for (int i = 0; i < N; i++) { diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index db4c2d6c115f04b436db00854ca4b02fea09866b..89bcd1bbc86dc29cb7b98cbef3057a8f98c74555 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -11,13 +11,10 @@ limitations under the License. 
*/ #define EIGEN_USE_GPU #include "paddle/fluid/operators/sum_op.h" -#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; -namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( sum, ops::SumKernel, ops::SumKernel, ops::SumKernel, - ops::SumKernel, - ops::SumKernel); + ops::SumKernel); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index dda6772796c821ffb813e73da0c34370e5339001..6dffe527c1072ee97fcde1725bfc1a47ed1ad74a 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -46,7 +46,7 @@ class SumKernel : public framework::OpKernel { if (!in_place) { math::SetConstant constant_functor; constant_functor(context.template device_context(), out, - static_cast(0)); + 0.0); } math::SelectedRowsAddToTensor functor; @@ -105,18 +105,30 @@ class SumKernel : public framework::OpKernel { auto &sel_row = get_selected_row(i); first_dim += sel_row.rows().size(); } - auto in_dim = - framework::vectorize(get_selected_row(N - 1).value().dims()); - in_dim[0] = static_cast(first_dim); + + std::vector in_dim; + for (int i = 0; i < N; i++) { + auto &sel_row = get_selected_row(i); + if (sel_row.rows().size() > 0) { + in_dim = framework::vectorize(sel_row.value().dims()); + break; + } + } + if (in_dim.empty()) { + VLOG(3) << "WARNING: all the inputs are empty"; + in_dim = framework::vectorize(get_selected_row(N - 1).value().dims()); + } else { + in_dim[0] = static_cast(first_dim); + } out_value->Resize(framework::make_ddim(in_dim)); + out_value->mutable_data(context.GetPlace()); // if all the input sparse vars are empty, no need to // merge these vars. if (first_dim == 0UL) { return; } - out_value->mutable_data(context.GetPlace()); math::SelectedRowsAddTo functor; diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index ee3078876c15b06a887064f08dc0c05d450b5f77..1048d3017140c9e31426a1580b2862667116a024 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -17,112 +17,16 @@ #include #include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include "paddle/fluid/inference/tensorrt/engine.h" -#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/operators/tensorrt_engine_op.h" namespace paddle { DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); +DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size"); +DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size"); namespace operators { -using inference::Singleton; -using inference::tensorrt::TRT_EngineManager; - -using FluidDT = framework::proto::VarType_Type; -using TRT_DT = nvinfer1::DataType; - -namespace { - -TRT_DT FluidDataType2TRT(FluidDT type) { - switch (type) { - case FluidDT::VarType_Type_FP32: - return TRT_DT::kFLOAT; - case FluidDT::VarType_Type_INT32: - return TRT_DT::kINT32; - default: - return TRT_DT::kINT32; - } - PADDLE_THROW("unkown type"); - return TRT_DT::kINT32; -} - -nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { - PADDLE_ENFORCE_GT(shape.size(), 1UL, - "TensorRT' tensor input requires at least 2 dimensions"); - PADDLE_ENFORCE_LE(shape.size(), 4UL, - "TensorRT' tensor input requires at most 4 dimensions"); - PADDLE_ENFORCE_EQ(shape.size(), 4UL); - return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); -} - -} // namespace - -template -void TensorRTEngineKernel::Prepare( - const 
framework::ExecutionContext &context) const { - VLOG(4) << "Prepare engine"; - // Get the ProgramDesc and pass to convert. - framework::proto::BlockDesc block_desc; - block_desc.ParseFromString(context.Attr("subgraph")); - int max_batch = context.Attr("max_batch"); - auto max_workspace = context.Attr("max_workspace"); - auto params = context.Attr>("parameters"); - std::unordered_set parameters; - for (const auto ¶m : params) { - parameters.insert(param); - } - - std::vector output_maps = - context.Attr>("output_name_mapping"); - - // TODO(Superjomn) replace this with a different stream - auto *engine = Singleton::Global().Create( - max_batch, max_workspace, nullptr /*engine hold its own stream*/, - context.Attr("engine_uniq_key")); - engine->InitNetwork(); - - framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); - VLOG(4) << "parsed var size " << block.AllVars().size(); - // Add inputs - VLOG(4) << "declare inputs"; - for (auto &input : context.Inputs("Xs")) { - if (parameters.count(input)) continue; - VLOG(4) << "declare input " << input; - auto *var = block.FindVar(input); - // TensorRT engine need to create parameters. The parameter's description - // should be set in - PADDLE_ENFORCE(var, "no variable called %s", input); - PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, - "TensorRT engine only takes LoDTensor as input"); - auto shape = var->GetShape(); - // For the special batch_size placeholder -1, drop it and pass the real - // shape of data. - // TODO(Superjomn) fix this with batch broadcast, or it can't handle - // variational batch size. - if (shape[0] == -1) { - shape[0] = FLAGS_tensorrt_engine_batch_size; - } - engine->DeclareInput( - input, FluidDataType2TRT( - var->Proto()->type().lod_tensor().tensor().data_type()), - Vec2TRT_Dims(shape)); - } - - inference::Singleton::Global().ConvertBlock( - block_desc, parameters, context.scope(), engine); - - // Add outputs - for (auto &output : output_maps) { - engine->DeclareOutput(output); - } - - engine->FreezeNetwork(); -} - class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -130,8 +34,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); AddAttr("engine_uniq_key", "unique key for the TRT engine."); - AddAttr("max_batch", "the maximum batch size."); - AddAttr("max_workspace", "the maximum batch size."); AddComment("TensorRT engine operator."); } }; @@ -150,11 +52,4 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp, ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker); -REGISTER_OP_CPU_KERNEL( - tensorrt_engine, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel, - ops::TensorRTEngineKernel); - #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/tensorrt_engine_op.cu.cc b/paddle/fluid/operators/tensorrt_engine_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e1ddfde6d51ef719ca0b89cf286b176195ee682a --- /dev/null +++ b/paddle/fluid/operators/tensorrt_engine_op.cu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/tensorrt_engine_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + tensorrt_engine, + ops::TensorRTEngineKernel, + ops::TensorRTEngineKernel, + ops::TensorRTEngineKernel, + ops::TensorRTEngineKernel); diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 2cbe1213a2f428a3ce56b06f97636baeb4b66c26..bc556ab3643cefa3e45d2a8a3835937753af723f 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -19,16 +19,51 @@ #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" namespace paddle { DECLARE_int32(tensorrt_engine_batch_size); +DECLARE_int32(tensorrt_max_batch_size); +DECLARE_int32(tensorrt_workspace_size); namespace operators { +using FluidDT = framework::proto::VarType_Type; +using TRT_DT = nvinfer1::DataType; + +namespace { + +TRT_DT FluidDataType2TRT(FluidDT type) { + switch (type) { + case FluidDT::VarType_Type_FP32: + return TRT_DT::kFLOAT; + case FluidDT::VarType_Type_INT32: + return TRT_DT::kINT32; + default: + return TRT_DT::kINT32; + } + PADDLE_THROW("unkown type"); + return TRT_DT::kINT32; +} + +nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape) { + PADDLE_ENFORCE_GT(shape.size(), 1UL, + "TensorRT' tensor input requires at least 2 dimensions"); + PADDLE_ENFORCE_LE(shape.size(), 4UL, + "TensorRT' tensor input requires at most 4 dimensions"); + PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL); + if (shape.size() == 4UL) + return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); + return nvinfer1::DimsCHW(shape[1], 1, 1); +} + +} // namespace + using inference::Singleton; using inference::tensorrt::TRT_EngineManager; @@ -47,7 +82,7 @@ class TensorRTEngineOp : public framework::OperatorWithKernel { .FindVar(input0) ->GetMutable() ->type()), - platform::CPUPlace()); + ctx.GetPlace()); return kt; } }; @@ -64,7 +99,7 @@ class TensorRTEngineKernel : public framework::OpKernel { auto input_names = context.op().Inputs("Xs"); PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, - context.Attr("max_batch")); + FLAGS_tensorrt_max_batch_size); std::vector output_maps = context.Attr>("output_name_mapping"); @@ -94,12 +129,19 @@ class TensorRTEngineKernel : public framework::OpKernel { // Convert output tensor from engine to fluid int output_index = 0; + VLOG(4) << "TensorRT Engine Op Outputs:"; for (const auto& y : context.Outputs("Ys")) { + VLOG(4) << y; // convert output and copy to fluid. nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]); auto dims = trt_t->getDimensions(); // Use the output ITensor's dims to reshape the Fluid Tensor. - std::vector ddim(dims.d, dims.d + dims.nbDims); + // The ITensor doesn't contain the batch size dim. 
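      // Illustration with assumed values (not from the patch): if the engine
      // reports output dims {6, 1, 1} and FLAGS_tensorrt_engine_batch_size is
      // 2, the ddim built below becomes {2, 6, 1, 1}.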
+ std::vector ddim; + ddim.push_back(FLAGS_tensorrt_engine_batch_size); + for (int i = 0; i < dims.nbDims; i++) { + ddim.push_back(dims.d[i]); + } auto* fluid_v = context.scope().FindVar(y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); @@ -113,9 +155,11 @@ class TensorRTEngineKernel : public framework::OpKernel { // TODO(Superjomn) change this float to dtype size. auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) * FLAGS_tensorrt_engine_batch_size; - engine->GetOutputInCPU(output_maps[output_index], - fluid_t->mutable_data(platform::CPUPlace()), - size * sizeof(float)); + engine->GetOutputInGPU( + output_maps[output_index], + fluid_t->mutable_data(platform::CUDAPlace( + boost::get(context.GetPlace()).device)), + size * sizeof(float)); //} else { // engine->GetOutputInGPU( // y, fluid_t->mutable_data(platform::CUDAPlace()), @@ -128,8 +172,67 @@ class TensorRTEngineKernel : public framework::OpKernel { } protected: - // Build the engine. - void Prepare(const framework::ExecutionContext& context) const; + void Prepare(const framework::ExecutionContext& context) const { + VLOG(4) << "Prepare engine"; + // Get the ProgramDesc and pass to convert. + framework::proto::BlockDesc block_desc; + block_desc.ParseFromString(context.Attr("subgraph")); + int max_batch = FLAGS_tensorrt_max_batch_size; + auto max_workspace = FLAGS_tensorrt_workspace_size; + auto params = context.Attr>("parameters"); + std::unordered_set parameters; + for (const auto& param : params) { + parameters.insert(param); + } + + std::vector output_maps = + context.Attr>("output_name_mapping"); + + // TODO(Superjomn) replace this with a different stream + auto* engine = Singleton::Global().Create( + max_batch, max_workspace, nullptr /*engine hold its own stream*/, + context.Attr("engine_uniq_key"), + boost::get(context.GetPlace()).device); + + engine->InitNetwork(); + + framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); + VLOG(4) << "parsed var size " << block.AllVars().size(); + // Add inputs + VLOG(4) << "declare inputs"; + for (auto& input : context.Inputs("Xs")) { + if (parameters.count(input)) continue; + VLOG(4) << "declare input " << input; + auto* var = block.FindVar(input); + // TensorRT engine need to create parameters. The parameter's description + // should be set in + PADDLE_ENFORCE(var, "no variable called %s", input); + PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, + "TensorRT engine only takes LoDTensor as input"); + auto shape = var->GetShape(); + // For the special batch_size placeholder -1, drop it and pass the real + // shape of data. + // TODO(Superjomn) fix this with batch broadcast, or it can't handle + // variational batch size. 
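      // Illustration with an assumed shape (not from the patch): an input
      // declared as {-1, 3, 224, 224} has its -1 batch placeholder replaced by
      // FLAGS_tensorrt_engine_batch_size below, before Vec2TRT_Dims keeps only
      // the CHW part of a 4-D shape.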
+ if (shape[0] == -1) { + shape[0] = FLAGS_tensorrt_engine_batch_size; + } + engine->DeclareInput( + input, FluidDataType2TRT( + var->Proto()->type().lod_tensor().tensor().data_type()), + Vec2TRT_Dims(shape)); + } + + inference::Singleton::Global() + .ConvertBlock(block_desc, parameters, context.scope(), engine); + + // Add outputs + for (auto& output : output_maps) { + engine->DeclareOutput(output); + } + + engine->FreezeNetwork(); + } }; } // namespace operators diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc index 37657fa0b0498986fe67027415279af1775e58b9..27c1d29762b3de5e57f877b271aae52e71eb7cf9 100644 --- a/paddle/fluid/operators/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/tensorrt_engine_op.h" #include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -23,20 +24,20 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" -USE_CPU_ONLY_OP(tensorrt_engine); +USE_CUDA_ONLY_OP(tensorrt_engine); namespace paddle { namespace operators { namespace { -void CreateCPUTensor(framework::Scope* scope, const std::string& name, - const std::vector& shape) { +void CreateCUDATensor(framework::Scope* scope, const std::string& name, + const std::vector& shape) { auto* var = scope->Var(name); auto* tensor = var->GetMutable(); auto dims = framework::make_ddim(shape); tensor->Resize(dims); - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); inference::tensorrt::RandomizeTensor(tensor, place, ctx); } @@ -57,6 +58,8 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block, using inference::analysis::SetAttr; TEST(TensorRTEngineOp, manual) { + FLAGS_tensorrt_engine_batch_size = 2; + FLAGS_tensorrt_max_batch_size = 2; framework::ProgramDesc program; auto* block_ = program.Proto()->add_blocks(); block_->set_idx(0); @@ -98,8 +101,6 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetOutput("Ys", std::vector({"z0"})); SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch", 100); - SetAttr(engine_op_desc.Proto(), "max_workspace", 1 << 10); SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); SetAttr>(engine_op_desc.Proto(), "parameters", std::vector({})); @@ -112,15 +113,15 @@ TEST(TensorRTEngineOp, manual) { LOG(INFO) << "engine_op " << engine_op.get(); framework::Scope scope; - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); // Prepare variables. 
- CreateCPUTensor(&scope, "x", std::vector({2, 4})); - CreateCPUTensor(&scope, "y", std::vector({4, 6})); - CreateCPUTensor(&scope, "z", std::vector({2, 6})); + CreateCUDATensor(&scope, "x", std::vector({2, 4})); + CreateCUDATensor(&scope, "y", std::vector({4, 6})); + CreateCUDATensor(&scope, "z", std::vector({2, 6})); - CreateCPUTensor(&scope, "y0", std::vector({6, 8})); - CreateCPUTensor(&scope, "z0", std::vector({2, 8})); + CreateCUDATensor(&scope, "y0", std::vector({6, 8})); + CreateCUDATensor(&scope, "z0", std::vector({2, 8})); // Execute them. LOG(INFO) << "engine_op run"; @@ -128,10 +129,12 @@ TEST(TensorRTEngineOp, manual) { } void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { + FLAGS_tensorrt_engine_batch_size = batch_size; + FLAGS_tensorrt_max_batch_size = batch_size; framework::ProgramDesc program; framework::Scope scope; - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); auto* block_ = program.Proto()->add_blocks(); block_->set_idx(0); @@ -165,10 +168,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { // Prepare variables. if (!x_created) { - CreateCPUTensor(&scope, x_name, std::vector(x_shape)); + CreateCUDATensor(&scope, x_name, std::vector(x_shape)); } - CreateCPUTensor(&scope, y_name, std::vector(y_shape)); - CreateCPUTensor(&scope, z_name, std::vector(z_shape)); + CreateCUDATensor(&scope, y_name, std::vector(y_shape)); + CreateCUDATensor(&scope, z_name, std::vector(z_shape)); // It is wired, need to copy manually. *block_->add_ops() = *fc->Proto(); diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index 4a8ac441cfaf642fde58ee30865a22e83c065498..92a0697e27ba0da66fa3b0f5380e7bd52575640d 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -30,6 +30,8 @@ class TopkOp : public framework::OperatorWithKernel { "Output(Indices) of TopkOp should not be null."); auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(input_dims.size(), 2, + "Rank of TopK op's input must be 2."); const int k = static_cast(ctx->Attrs().Get("k")); PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 5fc0784f665f9f4a4422ca9b70f7dc6001833a8f..9da8551eb2d7ea66ad434c42b54522432095ce29 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -11,19 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -using paddle::platform::float16; template struct Pair { @@ -35,11 +32,6 @@ struct Pair { id = id; } - __device__ __forceinline__ void clear() { - v = -INFINITY; - id = -1; - } - __device__ __forceinline__ void operator=(const Pair& in) { v = in.v; id = in.id; @@ -61,12 +53,6 @@ struct Pair { int64_t id; }; -template <> -__device__ __forceinline__ void Pair::clear() { - v = platform::raw_uint16_to_float16(0x400); - id = -1; -} - template __device__ __forceinline__ void AddTo(Pair topk[], const Pair& p, int beam_size) { @@ -164,7 +150,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, if (k < MaxLength - (*beam)) { topk[k] = topk[k + *beam]; } else { - topk[k].clear(); + topk[k].set(-INFINITY, -1); } } if (!(*is_empty)) { @@ -174,7 +160,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, } *max = topk[MaxLength - 1]; - if ((*max).v == static_cast(-1)) *is_empty = true; + if ((*max).v == -1) *is_empty = true; *beam = 0; } } @@ -195,7 +181,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, if (k < MaxLength - *beam) { topk[k] = topk[k + *beam]; } else { - topk[k].set(std::numeric_limits::min(), -1); + topk[k].set(-INFINITY, -1); } } if (!(*is_empty)) { @@ -287,7 +273,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, bool firststep = true; for (int k = 0; k < MaxLength; k++) { - topk[k].clear(); + topk[k].set(-INFINITY, -1); } while (k) { ThreadGetTopK(topk, &beam, k, @@ -339,7 +325,5 @@ class TopkOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL( - top_k, paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel); diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index edd1baa4ace4e246190afcd12b0716f1dd38e243..5248767c2eeb9388c26d203e64f8b2c68ffe0865 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -30,8 +30,10 @@ class CPUUniformRandomKernel : public framework::OpKernel { tensor = out_var->GetMutable(); } else if (out_var->IsType()) { auto shape = ctx.Attr>("shape"); - tensor = out_var->GetMutable()->mutable_value(); + auto* selected_rows = out_var->GetMutable(); + tensor = selected_rows->mutable_value(); tensor->Resize(framework::make_ddim(shape)); + selected_rows->mutable_rows()->reserve(shape[0]); } else { PADDLE_THROW( "uniform_random_op's output only" diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 2b8039a0c1bea07402435958608ea035ba862c90..e1c7323a30233f4ec4f60e46aa6088ee6d8601b7 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -11,14 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include #include #include -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/transform.h" namespace paddle { namespace operators { @@ -40,11 +36,6 @@ struct UniformGenerator { } }; -template -struct CastFunctor { - HOSTDEVICE V operator()(const T& a) { return static_cast(a); } -}; - // It seems that Eigen::Tensor::random in GPU will SEGFAULT. // Use std::random and thrust::random(thrust is a std library in CUDA) to // implement uniform random. @@ -75,50 +66,18 @@ class GPUUniformRandomKernel : public framework::OpKernel { T max = static_cast(context.Attr("max")); thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); - if (out_var->IsType() && - std::type_index(typeid(T)) == - std::type_index(typeid(platform::float16))) { - framework::Tensor master_copy_tensor; - master_copy_tensor.Resize(tensor->dims()); - float* master_copy_tensor_data = - master_copy_tensor.mutable_data(context.GetPlace()); - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(master_copy_tensor_data), - UniformGenerator(static_cast(min), - static_cast(max), seed)); - platform::Transform trans; - auto* in_begin = master_copy_tensor.data(); - auto* in_end = in_begin + master_copy_tensor.numel(); - auto* out_begin = tensor->mutable_data(context.GetPlace()); - trans(context.template device_context(), - in_begin, in_end, out_begin, CastFunctor()); - } else { - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGenerator(min, max, seed)); - } - if (VLOG_IS_ON(5)) { - framework::Tensor cpu_tensor; - framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); - auto& dev_ctx = - *platform::DeviceContextPool::Instance().Get(context.GetPlace()); - dev_ctx.Wait(); - auto x = framework::EigenVector::Flatten(cpu_tensor); - VLOG(5) << "The Uniform output " << x; - } + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + UniformGenerator(min, max, seed)); } }; } // namespace operators } // namespace paddle -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - uniform_random, paddle::operators::GPUUniformRandomKernel, - paddle::operators::GPUUniformRandomKernel, - paddle::operators::GPUUniformRandomKernel); -REGISTER_OP_CUDA_KERNEL( - uniform_random_batch_size_like, - paddle::operators::GPUUniformRandomKernel, - paddle::operators::GPUUniformRandomKernel, - paddle::operators::GPUUniformRandomKernel); +REGISTER_OP_CUDA_KERNEL(uniform_random, + paddle::operators::GPUUniformRandomKernel, + paddle::operators::GPUUniformRandomKernel); +REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like, + paddle::operators::GPUUniformRandomKernel, + paddle::operators::GPUUniformRandomKernel); diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index f2a15fdf572e0de30f9949dda5020e130b0c5585..0fc8d54f6400c9dfb6af1e764ed44e95195bfe6e 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -23,9 +23,9 @@ class UnsqueezeOpInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of UnsqueezeOp should not be null."); + "Input(X) of Unsqueeze operator should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), - 
"Output(Out) of UnsqueezeOp should not be null."); + "Output(Out) of Unsqueeze operator should not be null."); const auto &axes = ctx->Attrs().Get>("axes"); const auto &x_dims = ctx->GetInputDim("X"); @@ -95,7 +95,6 @@ class UnsqueezeOp : public framework::OperatorBase { framework::AttributeMap attrs; attrs["shape"] = framework::vectorize2int(out_dims); - attrs["inplace"] = Attr("inplace"); // Invoke Reshape op. auto reshape_op = framework::OpRegistry::CreateOp( "reshape", {{"X", {Input("X")}}, {"Shape", {}}}, @@ -126,13 +125,6 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker { " within [1, 6] dimensions (Eigen limit)."); } }); - AddAttr( - "inplace", - "(default: false) Unsqueeze the source tensor's shape without " - "memory copy. When Attr(inplace) is set true, the output " - "tensor shares memory with Input(X), otherwise, a new output " - "tensor is created, and its data are copied from Input(x).") - .SetDefault(false); AddComment(R"DOC( Unsqueeze Operator. @@ -168,7 +160,6 @@ class UnsqueezeGradOp : public framework::OperatorBase { framework::AttributeMap attrs; attrs["shape"] = framework::vectorize2int(x_dims); - attrs["inplace"] = Attr("inplace"); auto reshape_op = framework::OpRegistry::CreateOp( "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}}, diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc index 733157ea05ed39434b9a750e3a94ea548f512ce6..65a3bc928e47ac60f06e7efc75f42703e45acbb4 100644 --- a/paddle/fluid/operators/while_op.cc +++ b/paddle/fluid/operators/while_op.cc @@ -57,12 +57,16 @@ class WhileOp : public framework::OperatorBase { PADDLE_ENFORCE(platform::is_cpu_place(cond.place()), "Condition of while op must in CPU memory."); + + bool is_test = Attr("is_test"); + auto ctx = executor.Prepare(*program, block->ID()); while (cond.data()[0]) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); - - executor.Run(*program, ¤t_scope, block->ID(), - false /*create_local_scope*/); + executor.RunPreparedContext(ctx.get(), ¤t_scope, false); + if (is_test) { + scope.DeleteScope(¤t_scope); + } } } }; @@ -88,6 +92,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "variables generated in the i'th step."); AddAttr(kStepBlock, "The step block inside WhileOp"); + AddAttr("is_test", "True if in test phase.").SetDefault(false); AddComment(R"DOC( )DOC"); } @@ -103,12 +108,15 @@ class WhileGradOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { + PADDLE_ENFORCE(!Attr("is_test"), + "GradOp is only callable when is_test is false"); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); auto *program = block->Program(); + auto ctx = executor.Prepare(*program, block->ID()); auto *step_scopes = scope.FindVar(Input(kStepScopes))->GetMutable(); @@ -161,8 +169,7 @@ class WhileGradOp : public framework::OperatorBase { } } } - - executor.Run(*program, *cur_scope_iter, block->ID(), false); + executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false); auto &pg_names = Outputs(kXGRAD); auto &p_names = Inputs(kX); diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 7d53a684d6068c79659719159696ef5aebfeaa2b..fcd658d67cf4551dbdb9696ef49b5ab3cc58bf95 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ 
b/paddle/fluid/platform/cpu_info.cc @@ -103,15 +103,16 @@ size_t CUDAPinnedMaxChunkSize() { return CUDAPinnedMaxAllocSize() / 256; } -#ifdef PADDLE_WITH_XBYAK namespace jit { - +#ifdef PADDLE_WITH_XBYAK static Xbyak::util::Cpu cpu; bool MayIUse(const cpu_isa_t cpu_isa) { using namespace Xbyak::util; // NOLINT switch (cpu_isa) { case sse42: return cpu.has(Cpu::tSSE42); + case avx: + return cpu.has(Cpu::tAVX); case avx2: return cpu.has(Cpu::tAVX2); case avx512_common: @@ -134,8 +135,16 @@ bool MayIUse(const cpu_isa_t cpu_isa) { } return false; } +#else +bool MayIUse(const cpu_isa_t cpu_isa) { + if (cpu_isa == isa_any) { + return true; + } else { + return false; + } +} +#endif } // namespace jit -#endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index f5f67667594f1ab80058533e4c5d5b04c2592b60..5d17978dd7946596c490dc465dab51e7cf53a044 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -37,12 +37,11 @@ size_t CUDAPinnedMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CUDAPinnedMaxChunkSize(); -#ifdef PADDLE_WITH_XBYAK namespace jit { - typedef enum { isa_any, sse42, + avx, avx2, avx512_common, avx512_core, @@ -55,7 +54,6 @@ typedef enum { inline bool MayIUse(const cpu_isa_t cpu_isa); } // namespace jit -#endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index 23457ff5fe1ec27094113ba0dde26adc64c716b5..9f504d14a8da116648483c0f64cb511b46e6a97e 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -36,7 +36,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, #if CUDA_VERSION < 9000 return __shfl_down(val, delta, width); #else - return __shfl_down_sync(mask, val, delta, width); + return __shfl_down_sync(mask, val, static_cast(delta), width); #endif } @@ -46,9 +46,16 @@ template <> __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, float16 val, int delta, int width) { - half tmp = static_cast(val); - __shfl_down(tmp, static_cast(delta), width); - return float16(tmp); + return float16( + __shfl_down(static_cast(val), static_cast(delta), width)); +} +#else +template <> +__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, + float16 val, int delta, + int width) { + return float16(__shfl_down_sync(mask, static_cast(val), + static_cast(delta), width)); } #endif diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu index ca5ca1caeb23f01c047feeccf9c276b2dcd1cb68..ee45afab93d079374aefe366425502890854c28d 100644 --- a/paddle/fluid/platform/cuda_helper_test.cu +++ b/paddle/fluid/platform/cuda_helper_test.cu @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include @@ -123,7 +124,7 @@ void TestUnalign(size_t num, const int shift_bit) { cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); for (size_t i = 0; i < num / 2; ++i) { - // NOTE(dzhwinter): the float16 add has small underflow/overflow + // NOTE(dzhwinter): the float16 add has small truncate error. // so we use EXPECT_NEAR to check the result. 
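    // (float16 keeps an 11-bit significand, roughly three decimal digits, so
    // even sums of O(1) values can differ in the last digit.)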
EXPECT_NEAR(static_cast(out[i]), static_cast(AddFunctor()(r_in1[i], r_in2[i])), @@ -151,3 +152,83 @@ TEST(CudaAtomic, float16Unalign) { TestUnalign(static_cast(1024), /*shift_bit*/ 3); TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 3); } + +// https://devblogs.nvidia.com/faster-parallel-reductions-kepler/ +template +static __forceinline__ __device__ T WarpReduceSum(T val) { + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + val += paddle::platform::CudaShuffleDownSync(mask, val, offset); + } + return val; +} + +template +__forceinline__ __device__ T BlockReduce(T val) { + static __shared__ T shared[32]; // Shared mem for 32 partial sums + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + + val = WarpReduceSum(val); // Each warp performs partial reduction + + if (lane == 0) shared[wid] = val; // Write reduced value to shared memory + + __syncthreads(); // Wait for all partial reductions + + // read from shared memory only if that warp existed + val = + (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : static_cast(0); + + if (wid == 0) val = WarpReduceSum(val); // Final reduce within first warp + + return val; +} + +template +__global__ void DeviceReduceSum(T* in, T* out, size_t N) { + T sum(0); + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + sum += in[i]; + } + sum = BlockReduce(sum); + __syncthreads(); + if (threadIdx.x == 0) out[blockIdx.x] = sum; +} + +template +void TestReduce(size_t num, float atol = 0.01) { + T* in1; + T *d_in1, *d_in2; + size_t size = sizeof(T) * num; + cudaMalloc(reinterpret_cast(&d_in1), size); + cudaMalloc(reinterpret_cast(&d_in2), sizeof(T)); + in1 = reinterpret_cast(malloc(size)); + std::minstd_rand engine; + std::uniform_real_distribution dist(0.0, 1.0); + for (size_t i = 0; i < num; ++i) { + in1[i] = static_cast(dist(engine)); + } + auto out = std::accumulate(in1, in1 + num, static_cast(0)); + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + DeviceReduceSum<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num); + cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + // NOTE(dzhwinter): the float16 add has small underflow/overflow + // so we use EXPECT_NEAR to check the result. + EXPECT_NEAR(static_cast(in1[0]), static_cast(out), atol); + free(in1); + cudaFree(d_in1); + cudaFree(d_in2); +} + +TEST(CudaShuffleSync, float16) { + TestReduce(10); + TestReduce(1000); + + // float16 will overflow or accumulate truncate errors in big size. + TestReduce(10); + TestReduce(100, /*atol error*/ 1.0); +} diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 9da787a4073fa002f75154f7c4fba54e9ed8efa6..07159d4a12ef4b628f7705ed206d3334be46dfc8 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -3,7 +3,7 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc) # There is no macOS version of NCCL. 
-if (NOT APPLE) +if (NOT APPLE AND NOT WIN32) list(APPEND CUDA_SRCS nccl.cc) endif() diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 25bcda7eedc1ef42f75fb8fd1439f0c8f55015c3..c7c533bd42859c374c4783d43ec4cdd34a6a994a 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -17,10 +17,10 @@ #include #include #include -#include #include // NOLINT #include #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 77e46fa768b62c277d7b4027de7173e39a5672b4..0103e7a3accf88f3c83f109298010c3c9af3d549 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -15,9 +15,9 @@ limitations under the License. */ #pragma once #include -#include #include // NOLINT #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h index e8f4a82ef132be9e4ec3fb76f11766046a2ff638..b946f46e82af4b09fafff54765b899254a4ec1df 100644 --- a/paddle/fluid/platform/dynload/cupti.h +++ b/paddle/fluid/platform/dynload/cupti.h @@ -17,10 +17,10 @@ limitations under the License. */ #include #include -#include #include // NOLINT #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h index 5b9e0820e0b319fe7a636a57a0029caf038b4db3..2daf1b4215ce1f7f771bbac72bfe103b0b941976 100644 --- a/paddle/fluid/platform/dynload/curand.h +++ b/paddle/fluid/platform/dynload/curand.h @@ -14,9 +14,9 @@ limitations under the License. */ #pragma once #include -#include #include // NOLINT +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index 9e7a616094e184695de521aa035257bde4170a91..aa20553ceffceded09447693c6e92f55fb48702d 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -14,10 +14,10 @@ limitations under the License. 
*/ #pragma once -#include #include #include // NOLINT #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { @@ -49,25 +49,33 @@ extern void* mklml_dso_handle; #define MKLML_ROUTINE_EACH(__macro) \ __macro(cblas_sgemm); \ - __macro(cblas_saxpy); \ - __macro(cblas_scopy); \ - __macro(cblas_sgemv); \ - __macro(cblas_sgemm_batch); \ __macro(cblas_dgemm); \ + __macro(cblas_saxpy); \ __macro(cblas_daxpy); \ + __macro(cblas_scopy); \ __macro(cblas_dcopy); \ + __macro(cblas_sgemv); \ __macro(cblas_dgemv); \ - __macro(cblas_dgemm_batch); \ - __macro(vsAdd); \ - __macro(vdAdd); \ __macro(cblas_sgemm_alloc); \ - __macro(cblas_sgemm_pack); \ - __macro(cblas_sgemm_compute); \ - __macro(cblas_sgemm_free); \ __macro(cblas_dgemm_alloc); \ + __macro(cblas_sgemm_pack); \ __macro(cblas_dgemm_pack); \ + __macro(cblas_sgemm_compute); \ __macro(cblas_dgemm_compute); \ + __macro(cblas_sgemm_free); \ __macro(cblas_dgemm_free); \ + __macro(cblas_sgemm_batch); \ + __macro(cblas_dgemm_batch); \ + __macro(cblas_sdot); \ + __macro(cblas_ddot); \ + __macro(cblas_sscal); \ + __macro(cblas_dscal); \ + __macro(vsAdd); \ + __macro(vdAdd); \ + __macro(vsMul); \ + __macro(vdMul); \ + __macro(vsExp); \ + __macro(vdExp); \ __macro(MKL_Set_Num_Threads) MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index 575516f81870fc9f7b92919ffc20a201cb5cbce8..331ca9908e126d5dbca830457281fbf88fc1df09 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ b/paddle/fluid/platform/dynload/nccl.h @@ -13,12 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include #include #include // NOLINT - #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h index d157c1fda789b98f06ad069d2a9c4f421ff82dcd..18ed9956f1841874b27c2493e2f3e22fdfbf0448 100644 --- a/paddle/fluid/platform/dynload/warpctc.h +++ b/paddle/fluid/platform/dynload/warpctc.h @@ -14,10 +14,9 @@ limitations under the License. */ #pragma once -#include #include // NOLINT - #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" #include "warpctc/include/ctc.h" namespace paddle { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 81b5359b40589d898bda0dfa71afb6f51385354b..a76ba75f9eeb8c3f42fbf7254f629b0960a8f2d8 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -14,9 +14,6 @@ limitations under the License. */ #pragma once -#include // for dladdr -#include // for backtrace - #ifdef __GNUC__ #include // for __cxa_demangle #endif // __GNUC__ @@ -37,6 +34,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/to_string.h" @@ -44,7 +42,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/curand.h" -#ifndef __APPLE__ +#if !defined(__APPLE__) and !defined(_WIN32) #include "paddle/fluid/platform/dynload/nccl.h" #endif // __APPLE__ #endif // PADDLE_WITH_CUDA @@ -75,7 +73,7 @@ struct EnforceNotMet : public std::exception { sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl; sout << "PaddlePaddle Call Stacks: " << std::endl; - +#if !defined(_WIN32) void* call_stack[TRACE_STACK_LIMIT]; auto size = backtrace(call_stack, TRACE_STACK_LIMIT); auto symbols = backtrace_symbols(call_stack, size); @@ -95,6 +93,9 @@ struct EnforceNotMet : public std::exception { } } free(symbols); +#else + sout << "Windows not support stack backtrace yet."; +#endif err_str_ = sout.str(); } } @@ -205,7 +206,7 @@ inline typename std::enable_if::type throw_on_error( #endif } -#ifndef __APPLE__ +#if !defined(__APPLE__) and !defined(_WIN32) template inline typename std::enable_if::type throw_on_error( ncclResult_t stat, const Args&... args) { @@ -221,7 +222,7 @@ inline typename std::enable_if::type throw_on_error( #endif } } -#endif // __APPLE__ +#endif // __APPLE__ and windows #endif // PADDLE_WITH_CUDA template diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 10a3ad256b17ba41380cdc0377905d03188cbaa3..f6e9a52b275353c03c1f350719766922a97f6cb3 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -125,6 +125,11 @@ class MKLDNNHandler { return this->AcquireMemory(md, ptr, "@user_weights_mem_p"); } + std::shared_ptr AcquireBiasMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_bias_mem_p"); + } + std::shared_ptr AcquireDstMemory( const mkldnn::memory::desc& md, void* ptr) { return this->AcquireMemory(md, ptr, "@user_dst_mem_p"); diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h new file mode 100644 index 0000000000000000000000000000000000000000..a0a2d29500e7afbe8a9a43f010d5fd2d0c560467 --- /dev/null +++ b/paddle/fluid/platform/port.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#if !defined(_WIN32) +#include // for dladdr +#include // for backtrace +#else +#include +#include + +static void* dlsym(void* handle, const char* symbol_name) { + FARPROC found_symbol; + found_symbol = GetProcAddress((HMODULE)handle, symbol_name); + + if (found_symbol == NULL) { + throw std::runtime_error(std::string(symbol_name) + " not found."); + } + return reinterpret_cast(found_symbol); +} + +#endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 89ca4f781273e99bbb83216c238dfc5c88c0a22b..d6a14b3305c5cf2d544f17f39a3812f7f75b8a76 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,19 +1,22 @@ +set(PYBIND_DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method + ) +if(NOT WIN32) +list(APPEND PYBIND_DEPS parallel_executor) +endif() if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc - DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method - parallel_executor + DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB}) else() cc_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc - DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method - parallel_executor + DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB}) - if(NOT APPLE AND NOT ANDROID) + if(NOT APPLE AND NOT ANDROID AND NOT WIN32) target_link_libraries(paddle_pybind rt) - endif(NOT APPLE AND NOT ANDROID) + endif(NOT APPLE AND NOT ANDROID AND NOT WIN32) endif(WITH_AMD_GPU) cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python) diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index 76aa7d2010682416f68e982e9b89da9813abb078..a81715c3b317ac47d1aefd1752ef5b434a15494a 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -13,7 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/pybind/const_value.h" -#include +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" namespace paddle { @@ -24,6 +25,8 @@ void BindConstValue(pybind11::module* m) { m->def("kTempVarName", [] { return framework::kTempVarName; }); m->def("kGradVarSuffix", [] { return framework::kGradVarSuffix; }); m->def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; }); + m->def("kControlDepVarName", + [] { return framework::ir::Node::kControlDepVarName; }); auto op_proto_and_checker_maker = m->def_submodule("op_proto_and_checker_maker"); @@ -40,6 +43,9 @@ void BindConstValue(pybind11::module* m) { op_proto_and_checker_maker.def( "kOpRoleVarAttrName", framework::OpProtoAndCheckerMaker::OpRoleVarAttrName); + op_proto_and_checker_maker.def( + "kOpCreationCallstackAttrName", + framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName); } } // namespace pybind diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 40ced8e1c78e24f8df2a046fd95de1d196ff3085..67734659233515ca8110f4212a2b1553fe4e9d24 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -249,6 +249,7 @@ PYBIND11_PLUGIN(core) { self.set_rows(new_rows); #endif }) + .def("sync_index", [](SelectedRows &instance) { instance.SyncIndex(); }) .def("rows", [](SelectedRows &self) { auto rows = self.rows(); std::vector new_rows; @@ -595,8 +596,8 @@ All parameter, weight, gradient are variables in Paddle. // -- python binds for parallel executor. py::class_ pe(m, "ParallelExecutor"); - py::class_(pe, "ExecutionStrategy") - .def(py::init()) + py::class_ exec_strategy(pe, "ExecutionStrategy"); + exec_strategy.def(py::init()) .def_property( "num_threads", [](const ExecutionStrategy &self) { return self.num_threads_; }, @@ -623,6 +624,16 @@ All parameter, weight, gradient are variables in Paddle. [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; }); + exec_strategy.def_property( + "use_experimental_executor", + [](const ExecutionStrategy &self) { + return self.type_ == ExecutionStrategy::kExperimental; + }, + [](ExecutionStrategy &self, bool experimental) { + self.type_ = experimental ? 
ExecutionStrategy::kExperimental + : ExecutionStrategy::kDefault; + }); + py::class_ build_strategy(pe, "BuildStrategy"); py::enum_(build_strategy, "ReduceStrategy") diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 8460f93b841fe136db138e0dc7576f3aacdbeb5f..f2a9a6b3b9af59e5c4709eb822fa5d9ab1543a0c 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -116,7 +116,6 @@ function cmake_gen() { -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} - -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} -DPY_VERSION=${PY_VERSION:-2.7} ======================================== EOF @@ -146,7 +145,6 @@ EOF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ - -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} \ -DPY_VERSION=${PY_VERSION:-2.7} } diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index 1283de9d957a46b848c7bb6caf9c5f49398468e2..622a2d51049d164b6e8423e4054081f40f190cb9 100755 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -54,7 +54,7 @@ function cpu_config() { if [ $platform == "Linux" ]; then ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` elif [ $platform == "Darwin" ]; then - if [`sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu`]; then + if [ `sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu` ]; then # HT is OFF ht=1 fi diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py index 3038747bf8d03cf82613f1b329b7eeb778a0b9f2..38addd0cfd9bd0afde7eefc57f2111b717b7e636 100644 --- a/python/paddle/dataset/mnist.py +++ b/python/paddle/dataset/mnist.py @@ -24,7 +24,6 @@ import paddle.dataset.common import subprocess import numpy import platform -import six import tempfile from six.moves import range __all__ = ['train', 'test', 'convert'] diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 2377ac5f929eb21449689240da3061152a0541f9..e0ddd3b5ffecfd5c49b2da7dfa739e2c3e93838f 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -18,6 +18,7 @@ import collections import contextlib import re import six +import traceback import numpy as np @@ -49,6 +50,12 @@ EMPTY_VAR_NAME = core.kEmptyVarName() TEMP_VAR_NAME = core.kTempVarName() GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() +CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() + + +def generate_control_dev_var_name(): + import random + return CONTROL_DEP_VAR_PREFIX + "@" + str(random.random()) def grad_var_name(var_name): @@ -499,6 +506,10 @@ class Operator(object): if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: del op_attrs[role_var_name] + callstack_var_name = op_maker.kOpCreationCallstackAttrName() + op_attrs[callstack_var_name] = list( + reversed(traceback.format_stack()))[1:] + if len(self.desc.type()) != 0: return if type is None: @@ -1363,6 +1374,13 @@ class Program(object): self._current_role = core.op_proto_and_checker_maker.OpRole.Forward self._op_role_var = [] + # for distribute + self._is_distributed = False + self._is_chief = False + self._slice_vars_and_attrs = [] + self._endpoints = [] + self._distributed_lookup_table = None + @property def op_role(self): """ diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 6b67128fbfdb10d1abde4ebe7d663a2685bff109..5c4ec99c533829240ac0bcc9647acb870f3412f8 100644 --- 
a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -372,6 +372,7 @@ def load_vars(executor, load_vars( executor, dirname=dirname, + main_program=main_program, vars=list(filter(predicate, main_program.list_vars())), filename=filename) else: @@ -403,9 +404,15 @@ def load_vars(executor, inputs={}, outputs={"Out": load_var_list}, attrs={'file_path': os.path.join(dirname, filename)}) - executor.run(load_prog) + if main_program is None: + main_program = default_main_program() + + # load slice vars on pserver, if any. + _load_slice_up_vars(executor, dirname, + main_program._slice_vars_and_attrs) + def load_params(executor, dirname, main_program=None, filename=None): """ @@ -659,11 +666,19 @@ def save_inference_model(dirname, save_persistables(executor, dirname, inference_program, params_filename) + # if there is a lookup table, trainer 0 will notify all pservers to save it. + if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table: + lookup_table_filename = os.path.join(dirname, "__lookup_table__") + _save_lookup_tables_by_notify(executor, lookup_table_filename, + main_program._distributed_lookup_table, + main_program._endpoints) + def load_inference_model(dirname, executor, model_filename=None, - params_filename=None): + params_filename=None, + pserver_endpoints=None): """ Load inference model from a directory @@ -679,6 +694,10 @@ def load_inference_model(dirname, parameters were saved in a single binary file. If parameters were saved in separate files, set it as 'None'. + pserver_endpoints(list|None): This is only needed by distributed inference. + When a distributed lookup table is used during training, + it is also needed at inference time. The parameter is + a list of pserver endpoints. Returns: tuple: The return of this function is a tuple with three elements: @@ -697,12 +716,16 @@ def load_inference_model(dirname, exe = fluid.Executor(fluid.CPUPlace()) path = "./infer_model" + endpoints = ["127.0.0.1:2023","127.0.0.1:2024"] [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model(dirname=path, executor=exe) results = exe.run(inference_program, feed={feed_target_names[0]: tensor_img}, fetch_list=fetch_targets) + # if we need the distributed lookup table, we will use: + fluid.io.load_inference_model(dirname=path, executor=exe, pserver_endpoints=endpoints) + # In this example, the inference program was saved in + # "./infer_model/__model__" and parameters were saved in + # separate files in "./infer_model". @@ -729,6 +752,9 @@ def load_inference_model(dirname, program = Program.parse_from_string(program_desc_str) load_persistables(executor, dirname, program, params_filename) + if pserver_endpoints: + program = _endpoints_replacement(program, pserver_endpoints) + feed_target_names = program.desc.get_feed_target_names() fetch_target_names = program.desc.get_fetch_target_names() fetch_targets = [ @@ -738,6 +764,61 @@ return [program, feed_target_names, fetch_targets] + +
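The load_vars change above and the _load_slice_up_vars helper later in this diff exchange slice metadata through main_program._slice_vars_and_attrs. Below is a minimal illustrative sketch of that assumed layout; the FakeVar stand-in, the variable names, and the shapes are invented for the example and are not part of the patch.

from collections import namedtuple
from functools import reduce

# Stand-in for a fluid Variable; only the fields used here are modeled.
FakeVar = namedtuple('FakeVar', ['name', 'shape'])

# Assumed entry layout: (original var, start offset, sliced var on this pserver).
orig_var = FakeVar(name='fc_w', shape=(1000, 1000))
slice_var = FakeVar(name='fc_w.block0', shape=(500, 1000))
start = 0
slice_vars_and_attrs = [(orig_var, start, slice_var)]

# _load_slice_up_vars computes the end offset the same way: start plus the
# number of elements in the slice; it then loads the full tensor from dirname
# and appends a slice op over [start, end) along axis 0.
end = start + reduce(lambda x, y: x * y, slice_var.shape)
print(end)  # 500000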
+def _save_lookup_tables_by_notify(executor, dirname, lookup_table, + pserver_endpoints): + """ + This function sends a checkpoint notify message from trainer 0 + to all the pservers. + The checkpoint notify message contains the lookup table name and + the absolute path on the pserver where the lookup_table is saved. + + Args: + executor(Executor): The executor to run for sending the checkpoint notify. + dirname(str): The folder where to save. + lookup_table(string): the lookup table name; when using a distributed + lookup table, the name can be obtained from + DistributeTranspiler.table_name. + pserver_endpoints(list): the parameter server ip:port list; when using a + distributed lookup table, it can be obtained from the + distribute transpiler arguments. + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + table_name = "share_w" + ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] + + _save_lookup_tables_by_notify(executor=exe, + dirname=param_path, lookup_table=table_name, + pserver_endpoints=ps_endpoints) + """ + + pserver_notify_program = Program() + pserver_notify_block = pserver_notify_program.global_block() + + attrs = {} + attrs['epmap'] = pserver_endpoints + attrs['dir'] = dirname + attrs['lookup_table'] = lookup_table + + pserver_notify_block.append_op( + type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) + executor.run(pserver_notify_program) + + +def _endpoints_replacement(program, endpoints): + ENDPOINT_MAP = "epmap" + for op in program.global_block().ops: + if op.has_attr(ENDPOINT_MAP): + op.set_attr(ENDPOINT_MAP, endpoints) + program._sync_with_cpp() + return program + + def get_parameter_value(para, executor): """ Get the LoDTensor value of the given parameter. @@ -799,3 +880,46 @@ def get_parameter_value_by_name(name, executor, program=None): program = default_main_program() var = program.global_block().var(name) return get_parameter_value(var, executor) + + +def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs): + if not slice_vars_and_attrs: + return + + load_prog = Program() + load_block = load_prog.global_block() + + for var_tuple in slice_vars_and_attrs: + orig_var = var_tuple[0] + start = var_tuple[1] + slice_var = var_tuple[2] + end = start + reduce(lambda x, y: x * y, slice_var.shape) + + clone_orig_var = load_block.create_var( + name=orig_var.name, + type=orig_var.type, + shape=orig_var.shape, + dtype=orig_var.dtype, + persistable=True) + + clone_slice_var = load_block.create_var( + name=slice_var.name, + type=slice_var.type, + shape=slice_var.shape, + dtype=slice_var.dtype, + persistable=True) + + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [clone_orig_var]}, + attrs={'file_path': os.path.join(dirname, clone_orig_var.name)}) + load_block.append_op( + type="slice", + inputs={'Input': clone_orig_var}, + outputs={'Out': clone_slice_var}, + attrs={'axes': [0], + 'starts': [start], + 'ends': [end]}) + + executor.run(load_prog) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 173567a0a374ac4453025b67b047950936df2055..d2954c4c22069627bac5188cf1d485e50b68c8e4 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -661,6 +661,7 @@ class While(object): Args: cond (Variable): condition used to compare. + is_test(bool): A flag indicating whether execution is in test phase. name (str): The name of this layer.
Examples: @@ -683,7 +684,7 @@ class While(object): IN_WHILE_BLOCK = 1 AFTER_WHILE_BLOCK = 2 - def __init__(self, cond, name=None): + def __init__(self, cond, is_test=False, name=None): self.helper = LayerHelper("while", name=name) self.status = While.BEFORE_WHILE_BLOCK if not isinstance(cond, Variable): @@ -694,6 +695,7 @@ class While(object): if reduce(lambda a, b: a * b, cond.shape, 1) != 1: raise TypeError("condition should be a bool scalar") self.cond_var = cond + self.is_test = is_test def block(self): return WhileGuard(self) @@ -735,7 +737,8 @@ class While(object): }, outputs={'Out': out_vars, 'StepScopes': [step_scope]}, - attrs={'sub_block': while_block}) + attrs={'sub_block': while_block, + "is_test": self.is_test}) def lod_rank_table(x, level=0): @@ -1272,8 +1275,8 @@ class ConditionalBlock(object): parent_block.append_op( type='conditional_block', inputs={ - 'X': self.inputs, - 'Params': param_list, + 'Cond': self.inputs, + 'Input': param_list, }, outputs={'Out': out_list, 'Scope': [step_scope]}, diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 21a295a0982cbc51947a063beee542c13494024d..b03ee514f50f9a8c1425bd5b1d409b58ed62351a 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -24,7 +24,7 @@ from .layer_function_generator import templatedoc from .. import core from ..executor import global_scope from ..framework import convert_np_dtype_to_dtype_, default_main_program, \ - default_startup_program, program_guard, Program + default_startup_program, program_guard, Program, Variable from ..layer_helper import LayerHelper from ..unique_name import generate as unique_name @@ -209,7 +209,7 @@ class ListenAndServ(object): }) -def Send(endpoints, send_vars, sync=True): +def Send(endpoints, send_vars, dummy_output=None, sync=True): """ Send variables to the server side, and get vars from server side when server have finished running server side program. 
@@ -223,6 +223,13 @@ def Send(endpoints, send_vars, sync=True): """ assert (type(send_vars) == list) + if dummy_output is None: + dummy_output = [] + elif isinstance(dummy_output, Variable): + dummy_output = [dummy_output] + + assert (type(dummy_output) == list) + epmap = endpoints.split(",") endpoints = list(set(epmap)) @@ -232,6 +239,7 @@ def Send(endpoints, send_vars, sync=True): helper.append_op( type="send", inputs={"X": send_vars}, + outputs={"Out": dummy_output}, attrs={ "endpoints": endpoints, "epmap": epmap, @@ -241,7 +249,7 @@ def Send(endpoints, send_vars, sync=True): helper.append_op(type="send_barrier", attrs={"endpoints": endpoints}) -def Recv(endpoints, get_vars, sync=True): +def Recv(endpoints, get_vars, dummy_input=None, sync=True): """ Receive variables from server side @@ -256,13 +264,20 @@ def Recv(endpoints, get_vars, sync=True): """ assert (type(get_vars) == list) + if dummy_input is None: + dummy_input = [] + elif isinstance(dummy_input, Variable): + dummy_input = [dummy_input] + + assert (type(dummy_input) == list) + epmap = endpoints.split(",") endpoints = list(set(epmap)) helper = LayerHelper("Recv", **locals()) helper.append_op( type="recv", - inputs={"X": get_vars}, + inputs={"X": dummy_input}, outputs={"Out": get_vars}, attrs={"endpoints": endpoints, "epmap": epmap}) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d4efb682d95d8ff6e9badb074f73968617dc51a2..c8e4c99f9eaf4cd7ffc595457482a63326c13dfe 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -27,24 +27,85 @@ from . import utils import random from .. import unique_name from functools import reduce +import warnings __all__ = [ - 'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru', - 'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy', - 'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d', 'conv3d', - 'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'pool3d', - 'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'conv3d_transpose', - 'sequence_expand', 'lstm_unit', 'reduce_sum', 'reduce_mean', 'reduce_max', - 'reduce_min', 'reduce_prod', 'sequence_first_step', 'sequence_last_step', - 'dropout', 'split', 'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', - 'matmul', 'topk', 'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', - 'nce', 'hsigmoid', 'beam_search', 'row_conv', 'multiplex', 'layer_norm', - 'softmax_with_cross_entropy', 'smooth_l1', 'one_hot', - 'autoincreased_step_counter', 'reshape', 'lod_reset', 'lrn', 'pad', - 'label_smooth', 'roi_pool', 'dice_loss', 'image_resize', - 'image_resize_short', 'resize_bilinear', 'gather', 'random_crop', - 'mean_iou', 'relu', 'log', 'crop', 'rank_loss', 'prelu', 'flatten', - 'sequence_enumerate' + 'fc', + 'embedding', + 'dynamic_lstm', + 'dynamic_lstmp', + 'dynamic_gru', + 'gru_unit', + 'linear_chain_crf', + 'crf_decoding', + 'cos_sim', + 'cross_entropy', + 'square_error_cost', + 'chunk_eval', + 'sequence_conv', + 'conv2d', + 'conv3d', + 'sequence_pool', + 'sequence_softmax', + 'softmax', + 'pool2d', + 'pool3d', + 'batch_norm', + 'beam_search_decode', + 'conv2d_transpose', + 'conv3d_transpose', + 'sequence_expand', + 'lstm_unit', + 'reduce_sum', + 'reduce_mean', + 'reduce_max', + 'reduce_min', + 'reduce_prod', + 'sequence_first_step', + 'sequence_last_step', + 'dropout', + 'split', + 'ctc_greedy_decoder', + 'edit_distance', + 'l2_normalize', + 'matmul', + 'topk', + 'warpctc', + 'sequence_reshape', + 'transpose', + 
'im2sequence', + 'nce', + 'hsigmoid', + 'beam_search', + 'row_conv', + 'multiplex', + 'layer_norm', + 'softmax_with_cross_entropy', + 'smooth_l1', + 'one_hot', + 'autoincreased_step_counter', + 'reshape', + 'lod_reset', + 'lrn', + 'pad', + 'label_smooth', + 'roi_pool', + 'dice_loss', + 'image_resize', + 'image_resize_short', + 'resize_bilinear', + 'gather', + 'scatter', + 'random_crop', + 'mean_iou', + 'relu', + 'log', + 'crop', + 'rank_loss', + 'prelu', + 'flatten', + 'stack', + 'sequence_enumerate', ] @@ -1988,7 +2049,7 @@ def batch_norm(input, param_attr(ParamAttr): The parameter attribute for Parameter `scale`. bias_attr(ParamAttr): The parameter attribute for Parameter `bias`. data_layout(string, default NCHW): NCHW|NHWC - in_place(bool, Default False): Make the input and output of batch norm reuse memory. + in_place(bool, Default False): This argument is deprecated since 0.15.0. use_mkldnn(bool, Default false): ${use_mkldnn_comment} name(string, Default None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -2010,6 +2071,10 @@ def batch_norm(input, helper = LayerHelper('batch_norm', **locals()) dtype = helper.input_dtype() + if in_place: + raise warnings.warn("The argument in_place is deprecated since 0.15.0, " + "please do not set it True.") + input_shape = input.shape if data_layout == 'NCHW': channel_num = input_shape[1] @@ -2059,7 +2124,7 @@ def batch_norm(input, saved_mean = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) saved_variance = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) - batch_norm_out = input if in_place else helper.create_tmp_variable(dtype) + batch_norm_out = helper.create_tmp_variable(dtype) helper.append_op( type="batch_norm", @@ -4979,6 +5044,47 @@ def gather(input, index): return out +def scatter(input, index, updates, name=None): + """ + **Scatter Layer** + + Output is obtained by updating the input on selected indices on the first + axis. + + .. math:: + + Out = X + Out[Ids] = Updates + + Args: + input (Variable): The source input with rank>=1. + index (Variable): The index input with rank=1. Its dtype should be + int32 or int64 as it is used as indexes. + updates (Variable): The updated value of scatter op. + name (str|None): The output variable name. Default None. + + Returns: + output (Variable): The output is a tensor with the same shape as input. + + Examples: + + .. 
code-block:: python + + output = fluid.layers.scatter(input, index, updates) + + """ + helper = LayerHelper('scatter', **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + helper.append_op( + type="scatter", + inputs={"X": input, + "Ids": index, + "Updates": updates}, + outputs={"Out": out}) + return out + + @templatedoc() def random_crop(x, shape, seed=None): """ @@ -5464,4 +5570,18 @@ def sequence_enumerate(input, win_size, pad_value, name=None): outputs={'Out': out}, attrs={'win_size': win_size, 'pad_value': pad_value}) + + +def stack(x, axis=0): + helper = LayerHelper('stack', **locals()) + axis = 0 if axis is None else axis + + if not isinstance(x, list) and not isinstance(x, tuple): + x = [x] + + out = helper.create_tmp_variable(x[0].dtype) + helper.append_op( + type='stack', inputs={'X': x}, outputs={'Y': out}, + attrs={'axis': axis}) + return out diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index cc4a7de163ac2e52e43131a021160051423e040c..7cd62efda8900c830f43d882a41ab03184ebe594 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -65,7 +65,6 @@ __all__ = [ 'uniform_random_batch_size_like', 'gaussian_random', 'gaussian_random_batch_size_like', - 'scatter', 'sum', 'slice', 'shape', diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 051fe84364639ca6028326c0cb02b204a02531af..01563cbbb706d9a1c9c9d46ded71f7f48b5a9f04 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -229,7 +229,7 @@ def img_conv_group(input, use_mkldnn=use_mkldnn) if conv_with_batchnorm[i]: - tmp = layers.batch_norm(input=tmp, act=conv_act, in_place=True) + tmp = layers.batch_norm(input=tmp, act=conv_act) drop_rate = conv_batchnorm_drop_rate[i] if abs(drop_rate) > 1e-5: tmp = layers.dropout(x=tmp, dropout_prob=drop_rate) diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index 9fe361425c128590da910128beaccb3336f8ba57..cd1e8cd682315ef4931e323536a57542f4b3bc26 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -256,7 +256,10 @@ def main(net_type, use_cuda, is_local=True): save_dirname = "image_classification_" + net_type + ".inference.model" train(net_type, use_cuda, save_dirname, is_local) - infer(use_cuda, save_dirname) + + # There is bug in fluid.InferenceTranspiler for VGG. 
+ if net_type == "resnet": + infer(use_cuda, save_dirname) class TestImageClassification(unittest.TestCase): diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py index 10918a985f80f9f20629180b0359ce73f448d33b..61d81f483636a99ea9e0282de89f12e47f3b824c 100644 --- a/python/paddle/fluid/tests/test_if_else_op.py +++ b/python/paddle/fluid/tests/test_if_else_op.py @@ -30,7 +30,8 @@ import numpy as np class TestMNISTIfElseOp(unittest.TestCase): - def test_raw_api(self): + # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379 + def not_test_raw_api(self): prog = Program() startup_prog = Program() with program_guard(prog, startup_prog): @@ -91,7 +92,8 @@ class TestMNISTIfElseOp(unittest.TestCase): return self.assertFalse(True) - def test_ifelse(self): + # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379 + def not_test_ifelse(self): prog = Program() startup_prog = Program() with program_guard(prog, startup_prog): @@ -153,6 +155,13 @@ class TestIfElse(unittest.TestCase): self.cond_value = 0.5 self.data = np.random.rand(25, 1).astype(np.float32) + def numpy_cal(self): + s1 = self.data[np.where(self.data < self.cond_value)] + res = np.sum(np.exp(s1)) + s2 = self.data[np.where(self.data >= self.cond_value)] + res += np.sum(np.tanh(s2)) + return res + def compare_ifelse_op_and_numpy(self, place): self.set_test_case() @@ -166,10 +175,12 @@ class TestIfElse(unittest.TestCase): ie = layers.IfElse(ifcond) with ie.true_block(): true_target = ie.input(src) + true_target = fluid.layers.exp(true_target) ie.output(true_target) with ie.false_block(): false_target = ie.input(src) + false_target = fluid.layers.tanh(false_target) ie.output(false_target) if_out = ie() out = layers.reduce_sum(if_out) @@ -180,7 +191,8 @@ class TestIfElse(unittest.TestCase): o1, = exe.run(fluid.default_main_program(), feed={'data': self.data}, fetch_list=[out]) - o2 = np.sum(self.data) + o2 = self.numpy_cal() + self.assertTrue( np.allclose( o1, o2, atol=1e-8), diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index 722b3e159abf4737b2bb43c7b84e23a3618cda12..85a96c0b53f6bc08687965048d6251265055a6fe 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -46,7 +46,8 @@ def cnn_model(data): pool_size=2, pool_stride=2, act="relu", - param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant())) + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.3))) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, @@ -54,7 +55,8 @@ def cnn_model(data): pool_size=2, pool_stride=2, act="relu", - param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant())) + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.2))) SIZE = 10 input_shape = conv_pool_2.shape @@ -66,8 +68,7 @@ def cnn_model(data): size=SIZE, act="softmax", param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=scale, seed=1))) + initializer=fluid.initializer.Constant(value=0.1))) return predict diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index b0ee6ff9f5941b090e663dac0122cfb575f2f442..0387e911880256ea6b8efb6f2311bbf4c4f8c0f2 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ 
b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -16,7 +16,6 @@ from __future__ import print_function import numpy as np import argparse -import six import time import math @@ -130,7 +129,12 @@ class SE_ResNeXt(): input=conv, pool_size=7, pool_type='avg', global_pooling=True) drop = fluid.layers.dropout(x=pool, dropout_prob=0.2) stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0) - out = fluid.layers.fc(input=drop, size=class_dim, act='softmax') + out = fluid.layers.fc( + input=drop, + size=class_dim, + act='softmax', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.2))) return out def shortcut(self, input, ch_out, stride): @@ -180,7 +184,7 @@ class SE_ResNeXt(): act=None, # avoid pserver CPU init differs from GPU param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant()), + initializer=fluid.initializer.Constant(value=0.2)), bias_attr=False) return fluid.layers.batch_norm(input=conv, act=act) @@ -229,10 +233,8 @@ class DistSeResneXt2x2(TestDistRunnerBase): lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] optimizer = fluid.optimizer.Momentum( - # FIXME(typhoonzero): add back LR decay once ParallelExecutor fixed. - #learning_rate=fluid.layers.piecewise_decay( - # boundaries=bd, values=lr), - learning_rate=base_lr, + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), momentum=0.9, regularization=fluid.regularizer.L2Decay(1e-4)) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index ab4c5c3f368333ac42781aebd495579b5c26f388..239adcb9d5900d4073a6c07cb189ab7503aea86e 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -265,9 +265,9 @@ def main(role="pserver", if __name__ == "__main__": - if len(sys.argv) != 7: + if len(sys.argv) != 8: print( - "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]" + "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]" ) role = sys.argv[1] endpoints = sys.argv[2] @@ -275,6 +275,8 @@ if __name__ == "__main__": current_endpoint = sys.argv[4] trainers = int(sys.argv[5]) is_dist = True if sys.argv[6] == "TRUE" else False + # FIXME(typhoonzero): refine this test. 
+ is_async = True if sys.argv[7] == "TRUE" else False main( role=role, endpoints=endpoints, diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 972e44c9528a29417d9689dcb2408b9381346f31..44cd073379f293a1114c2c77fa80d35d112d4fb8 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -56,8 +56,8 @@ def get_numeric_gradient(place, def get_output(): sum = [] + op.run(scope, place) for output_name in output_names: - op.run(scope, place) sum.append( np.array(scope.find_var(output_name).get_tensor()).mean()) return np.array(sum).mean() diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 9be53c4609d5027dd4e482ab936a135420a346ae..74e9d5c5f91e53a315c85d428571ce45bacede8a 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -38,7 +38,8 @@ class TestParallelExecutorBase(unittest.TestCase): seed=None, use_parallel_executor=True, use_reduce=False, - optimizer=fluid.optimizer.Adam): + optimizer=fluid.optimizer.Adam, + use_fast_executor=False): def run_executor(exe, feed, fetch_list, program=None): if isinstance(exe, fluid.ParallelExecutor): res = exe.run(fetch_list=fetch_list, feed=feed) @@ -71,6 +72,8 @@ class TestParallelExecutorBase(unittest.TestCase): startup_exe.run(startup) exec_strategy = fluid.ExecutionStrategy() exec_strategy.allow_op_delay = allow_op_delay + if use_fast_executor: + exec_strategy.use_experimental_executor = True build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ diff --git a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py new file mode 100644 index 0000000000000000000000000000000000000000..a7382c2244ec3291c4e8f625cc2d15499e0acdac --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py @@ -0,0 +1,208 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
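A short illustrative note (not part of the patch) on the LoD convention that the attention_lstm reference implementation below relies on: lod is a level-0 LoD given as per-sequence lengths, from which the batch size and the row range of each sequence in the flattened (T, M) input are derived.

lod = [[3, 2, 4, 7, 5]]      # five sequences with these lengths
N = len(lod[0])              # batch size: 5
T = sum(lod[0])              # total time steps: 21, so x has shape (T, M)

offsets = [0]
for seq_len in lod[0]:
    offsets.append(offsets[-1] + seq_len)
# sequence i occupies rows offsets[i]:offsets[i + 1] of x, which is exactly
# the start_offset bookkeeping done inside attention_lstm() below.
print(N, T, offsets)  # 5 21 [0, 3, 5, 9, 16, 21]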
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +from test_fusion_lstm_op import fc, ACTIVATION +from test_softmax_op import stable_softmax + + +def attention_lstm( + x, # T x M + lod, # 1 x N + h0, # N x D + c0, # N x D + fcws, # (M+D) x 1, 1x1 + fcbs, # 1 x 1, 1x1 + w, # (M+D) x 4D + b, # 1 x 4D + act_gate, + act_cell, + act_cand): + + T = sum(lod[0]) + N = len(lod[0]) + M = x.shape[1] + D = b.shape[1] / 4 + assert T == x.shape[0] + assert len(fcws) == len(fcbs) + hidden = [] + cell = [] + + start_offset = 0 + for bid in range(N): + seq_len = lod[0][bid] + xi = np.copy(x[start_offset:start_offset + seq_len, :]).reshape(seq_len, + M) + prev_cell = np.copy(c0[bid]).reshape([1, D]) + prev_hidden = np.copy(h0[bid]).reshape([1, D]) + for step in range(seq_len): + expanded_cell = np.repeat(prev_cell, seq_len, axis=0) + tmp = np.concatenate((xi, expanded_cell), axis=1) + assert tmp.shape[0] == seq_len + assert tmp.shape[1] == M + D + for fcid in range(len(fcbs)): + tmp = fc(tmp, fcws[fcid], fcbs[fcid]) + tmp = ACTIVATION['relu'](tmp) + tmp = np.reshape(tmp, (1, seq_len)) + tmp = stable_softmax(tmp).reshape(seq_len, 1) + lstmx = xi * tmp # seq * M + lstmx = np.sum(lstmx.reshape(seq_len, M), axis=0).reshape([1, M]) + lstmin = np.concatenate((prev_hidden, lstmx), axis=1) + lstmout = fc(lstmin, w, b).reshape([1, 4 * D]) + + g_f, g_i, g_o, cand = np.split(lstmout, 4, axis=1) + g_f = act_gate(g_f).reshape([1, D]) + g_i = act_gate(g_i).reshape([1, D]) + g_o = act_gate(g_o).reshape([1, D]) + cand = act_cand(cand).reshape([1, D]) + + cell_t = (prev_cell * g_f) + (g_i * cand) + hidden_t = g_o * act_cell(cell_t) + + hidden.append(hidden_t.flatten()) + cell.append(cell_t.flatten()) + + prev_cell = cell_t.reshape([1, D]) + prev_hidden = hidden_t.reshape([1, D]) + + start_offset += seq_len + + hidden = np.array(hidden).astype('float32').reshape([T, D]) + cell = np.array(cell).astype('float32').reshape([T, D]) + return hidden, cell + + +class TestAttentionLSTMOp(OpTest): + def set_conf(self): + pass + + def setUp(self): + self.op_type = 'attention_lstm' + self.lod = [[3]] + self.M = 30 + self.D = 15 + self.has_initial_hidden = True + self.act_gate = 'sigmoid' + self.act_cell = 'tanh' + self.act_cand = 'tanh' + self.set_conf() + + T = sum(self.lod[0]) + bs = len(self.lod[0]) + + x = np.random.normal(size=(T, self.M)).astype('float32') + c0 = np.random.normal(size=(bs, self.D)).astype('float32') + if self.has_initial_hidden: + h0 = np.random.normal(size=(bs, self.D)).astype('float32') + else: + h0 = np.zeros((bs, self.D)).astype('float32') + + fcw1 = np.random.normal(size=(self.M + self.D, 1)).astype('float32') + fcb1 = np.random.normal(size=(1, 1)).astype('float32') + fcw2 = np.random.normal(size=(1, 1)).astype('float32') + fcb2 = np.random.normal(size=(1, 1)).astype('float32') + + # lstm weight and bias + w = np.random.normal(size=(self.M + self.D, + self.D * 4)).astype('float32') + b = np.random.normal(size=(1, self.D * 4)).astype('float32') + + h, c = attention_lstm(x, self.lod, h0, c0, [fcw1, fcw2], [fcb1, fcb2], + w, b, ACTIVATION[self.act_gate], + ACTIVATION[self.act_cell], + ACTIVATION[self.act_cand]) + + self.inputs = { + 'X': (x, self.lod), + 'C0': c0, + 'AttentionWeight': fcw1, + 'AttentionBias': fcb1, + 'AttentionScalar': fcw2, + 'AttentionScalarBias': fcb2, + 'LSTMWeight': w, + 'LSTMBias': b + } + + if self.has_initial_hidden: + self.inputs['H0'] = h0 + + self.outputs = { + 'Hidden': (h, self.lod), + 'Cell': (c, self.lod), + } + self.attrs 
= { + 'gate_activation': self.act_gate, + 'cell_activation': self.act_cell, + 'candidate_activation': self.act_cand + } + + def test_check_output(self): + self.check_output() + + +class TestAttentionOpNonInit(TestAttentionLSTMOp): + def set_conf(self): + self.has_initial_hidden = False + + +class TestAttentionOpAct(TestAttentionLSTMOp): + def set_conf(self): + self.M = 3 + self.D = 2 + self.act_gate = 'relu' + self.act_cell = 'tanh' + self.act_cand = 'sigmoid' + + +class TestAttentionOpMD1(TestAttentionLSTMOp): + def set_conf(self): + self.M = 36 + self.D = 8 + + +class TestAttentionOpMD2(TestAttentionLSTMOp): + def set_conf(self): + self.M = 8 + self.D = 8 + + +class TestAttentionOpMD3(TestAttentionLSTMOp): + def set_conf(self): + self.M = 15 + self.D = 30 + + +class TestAttentionOpBS1(TestAttentionLSTMOp): + def set_conf(self): + self.lod = [[5]] + self.M = 16 + self.D = 32 + + +class TestAttentionOpBS2(TestAttentionLSTMOp): + def set_conf(self): + self.lod = [[3, 6]] + + +class TestAttentionOpBS5(TestAttentionLSTMOp): + def set_conf(self): + self.lod = [[3, 2, 4, 7, 5]] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py index 88d44e453c7976f5e0fbda2c0871dfabd4bb30aa..fa6b67956259f33b109758c5939ab5729482695a 100644 --- a/python/paddle/fluid/tests/unittests/test_desc_clone.py +++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py @@ -27,6 +27,7 @@ import unittest from multiprocessing import Process import os import signal +import six import collections SEED = 1 @@ -55,7 +56,8 @@ def cnn_model(data): # TODO(dzhwinter) : refine the initializer and random seed settting SIZE = 10 input_shape = conv_pool_2.shape - param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + param_shape = [six.moves.reduce(lambda a, b: a * b, input_shape[1:], 1) + ] + [SIZE] scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 predict = fluid.layers.fc( @@ -108,7 +110,7 @@ def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): def operator_equal(a, b): - for k, v in a.__dict__.iteritems(): + for k, v in six.iteritems(a.__dict__): if isinstance(v, fluid.framework.Program) or \ isinstance(v, fluid.framework.Block): continue @@ -118,8 +120,8 @@ def operator_equal(a, b): raise ValueError("In operator_equal not equal:{0}\n".format(k)) elif isinstance(v, collections.OrderedDict): - v0 = sorted(v.iteritems(), key=lambda x: x[0]) - v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0]) + v0 = sorted(six.iteritems(v), key=lambda x: x[0]) + v1 = sorted(six.iteritems(b.__dict__[k]), key=lambda x: x[0]) if v0 != v1: raise ValueError("In operator_equal not equal:{0}\n".format(k)) @@ -131,7 +133,7 @@ def operator_equal(a, b): def block_equal(a, b): - for k, v in a.__dict__.iteritems(): + for k, v in six.iteritems(a.__dict__): if isinstance(v, core.ProgramDesc) or isinstance( v, fluid.framework.Program) or isinstance(v, core.BlockDesc): continue @@ -143,8 +145,8 @@ def block_equal(a, b): assert (len(a.ops) == len(b.ops)) elif isinstance(v, collections.OrderedDict): - v0 = sorted(v.iteritems(), key=lambda x: x[0]) - v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0]) + v0 = sorted(six.iteritems(v), key=lambda x: x[0]) + v1 = sorted(six.iteritems(b.__dict__[k]), key=lambda x: x[0]) if v0 != v1: raise ValueError("In block_equal not equal:{0}\n".format(k)) @@ -156,7 +158,7 @@ def block_equal(a, b): def program_equal(a, b): - for k, v in a.__dict__.iteritems(): + 
for k, v in six.iteritems(a.__dict__): if isinstance(v, core.ProgramDesc): continue diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 4c71181d0d736bd1a8796b2d38ed1667557e3db8..0e815c91446b285ba2c2c5aa9ad18d97f51eae65 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -30,7 +30,7 @@ class TestDistRunnerBase(object): "get_model should be implemented by child classes.") def get_transpiler(self, trainer_id, main_program, pserver_endpoints, - trainers): + trainers, sync_mode): # NOTE: import fluid until runtime, or else forking processes will cause error. import paddle import paddle.fluid as fluid @@ -39,17 +39,22 @@ class TestDistRunnerBase(object): trainer_id=trainer_id, program=main_program, pservers=pserver_endpoints, - trainers=trainers) + trainers=trainers, + sync_mode=sync_mode) return t - def run_pserver(self, pserver_endpoints, trainers, current_endpoint, - trainer_id): + def run_pserver(self, + pserver_endpoints, + trainers, + current_endpoint, + trainer_id, + sync_mode=True): import paddle import paddle.fluid as fluid self.get_model(batch_size=2) t = self.get_transpiler(trainer_id, fluid.default_main_program(), pserver_endpoints, - trainers) + trainers, sync_mode) pserver_prog = t.get_pserver_program(current_endpoint) startup_prog = t.get_startup_program(current_endpoint, pserver_prog) place = fluid.CPUPlace() @@ -57,7 +62,13 @@ class TestDistRunnerBase(object): exe.run(startup_prog) exe.run(pserver_prog) - def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): + def run_trainer(self, + place, + endpoints, + trainer_id, + trainers, + is_dist=True, + sync_mode=True): import paddle import paddle.fluid as fluid test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ @@ -65,7 +76,7 @@ class TestDistRunnerBase(object): if is_dist: t = self.get_transpiler(trainer_id, fluid.default_main_program(), endpoints, - trainers) + trainers, sync_mode) trainer_prog = t.get_trainer_program() else: trainer_prog = fluid.default_main_program() @@ -106,9 +117,9 @@ def runtime_main(test_class): import paddle.fluid as fluid import paddle.fluid.core as core - if len(sys.argv) != 7: + if len(sys.argv) != 8: print( - "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]" + "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]" ) role = sys.argv[1] endpoints = sys.argv[2] @@ -116,34 +127,43 @@ def runtime_main(test_class): current_endpoint = sys.argv[4] trainers = int(sys.argv[5]) is_dist = True if sys.argv[6] == "TRUE" else False + sync_mode = True if sys.argv[7] == "TRUE" else False model = test_class() if role == "pserver": - model.run_pserver(endpoints, trainers, current_endpoint, trainer_id) + model.run_pserver(endpoints, trainers, current_endpoint, trainer_id, + sync_mode) else: p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( ) else fluid.CPUPlace() - model.run_trainer(p, endpoints, trainer_id, trainers, is_dist) + model.run_trainer(p, endpoints, trainer_id, trainers, is_dist, + sync_mode) import paddle.compat as cpt class TestDistBase(unittest.TestCase): + def _setup_config(self): + raise NotImplementedError("tests should have _setup_config implemented") + def setUp(self): self._trainers = 2 self._pservers = 2 self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" 
self._python_interp = "python" + self._sync_mode = True + self._setup_config() def start_pserver(self, model_file, check_error_log): + sync_mode_str = "TRUE" if self._sync_mode else "FALSE" ps0_ep, ps1_ep = self._ps_endpoints.split(",") - ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \ + ps0_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \ (self._python_interp, model_file, self._ps_endpoints, ps0_ep, - self._trainers) - ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \ + self._trainers, sync_mode_str) + ps1_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \ (self._python_interp, model_file, self._ps_endpoints, ps1_ep, - self._trainers) + self._trainers, sync_mode_str) ps0_pipe = subprocess.PIPE ps1_pipe = subprocess.PIPE @@ -195,9 +215,10 @@ class TestDistBase(unittest.TestCase): # Run local to get a base line env_local = {"CUDA_VISIBLE_DEVICES": "0"} env_local.update(required_envs) - local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \ + sync_mode_str = "TRUE" if self._sync_mode else "FALSE" + local_cmd = "%s %s trainer %s 0 %s %d FLASE %s" % \ (self._python_interp, model_file, - "127.0.0.1:1234", "127.0.0.1:1234", 1) + "127.0.0.1:1234", "127.0.0.1:1234", 1, sync_mode_str) if not check_error_log: local_proc = subprocess.Popen( local_cmd.split(" "), @@ -226,12 +247,12 @@ class TestDistBase(unittest.TestCase): self._wait_ps_ready(ps1.pid) ps0_ep, ps1_ep = self._ps_endpoints.split(",") - tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \ + tr0_cmd = "%s %s trainer %s 0 %s %d TRUE %s" % \ (self._python_interp, model_file, self._ps_endpoints, ps0_ep, - self._trainers) - tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \ + self._trainers, sync_mode_str) + tr1_cmd = "%s %s trainer %s 1 %s %d TRUE %s" % \ (self._python_interp, model_file, self._ps_endpoints, ps1_ep, - self._trainers) + self._trainers, sync_mode_str) env0 = {"CUDA_VISIBLE_DEVICES": "0"} env1 = {"CUDA_VISIBLE_DEVICES": "1"} diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 4ec68d411b0f0e9ae89b107914e8fd844a19228b..36bab6f04603b7ad3218603489eead859bfcb5b6 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -17,10 +17,21 @@ import unittest from test_dist_base import TestDistBase -class TestDistSeResneXt2x2(TestDistBase): +class TestDistMnist2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + def test_se_resnext(self): self.check_with_place("dist_mnist.py", delta=1e-7) +class TestDistMnistAsync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + + def test_se_resnext(self): + self.check_with_place("dist_mnist.py", delta=200) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index 16525f6fdb60a90a44a628fb0648f4130218c102..c0e9fa38e7d1eadd89eff9a8ba4442f888b8120e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -18,9 +18,20 @@ from test_dist_base import TestDistBase class TestDistSeResneXt2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + def test_se_resnext(self): self.check_with_place("dist_se_resnext.py", delta=1e-7) +class TestDistSeResneXt2x2Async(TestDistBase): + def _setup_config(self): + self._sync_mode = False + + def test_se_resnext(self): + self.check_with_place("dist_se_resnext.py", delta=100) + + if __name__ == 
"__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py index 313207ff9ce054f81322224cb6ceafaaf25bbedf..62fcf5953f93637a20beed649de21476a8673419 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py @@ -19,6 +19,9 @@ from test_dist_base import TestDistBase class TestDistTransformer2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + def test_transformer(self): # TODO(paddle-dev): check if the delta is OK. # Usually start around ~8000 and converge to ~5000 diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 03083c9960636886ffa49137b9e9d4a71fbd72fe..9f04d290f7596a60d5fdfa66cbc4beec1c3fe93d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -47,7 +47,6 @@ class TranspilerTest(unittest.TestCase): avg_cost = fluid.layers.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) sgd_optimizer.minimize(avg_cost) - return def get_main_program(self): main = fluid.Program() @@ -95,8 +94,9 @@ class TranspilerTest(unittest.TestCase): def test_transpiler(self): main = fluid.Program() startup = fluid.Program() - with fluid.program_guard(main, startup): - self.transpiler_test_impl() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.transpiler_test_impl() class TestBasicModel(TranspilerTest): @@ -249,7 +249,6 @@ class TestLRDecay(TranspilerTest): decay_rate=0.1, staircase=True)) sgd_optimizer.minimize(avg_cost) - return def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) @@ -279,7 +278,6 @@ class TestLRDecayConditional(TranspilerTest): learning_rate=fluid.layers.piecewise_decay([10000, 20000], [1.0, 0.5, 1.0])) sgd_optimizer.minimize(avg_cost) - return def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) @@ -328,7 +326,6 @@ class TestL2Decay(TranspilerTest): avg_cost = fluid.layers.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) sgd_optimizer.minimize(avg_cost) - return def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) @@ -363,7 +360,6 @@ class TestL2DecayWithPiecewise(TranspilerTest): momentum=0.9, regularization=fluid.regularizer.L2Decay(1e-4)) sgd_optimizer.minimize(avg_cost) - return def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) @@ -393,13 +389,14 @@ class TestDistLookupTableBase(TranspilerTest): def network_with_table(self, is_sparse, is_distributed): self.table_size = 1000 self.emb_size = 64 + self.lookup_table_name = 'shared_w' def emb_pool(ids): emb = fluid.layers.embedding( input=ids, size=[self.table_size, self.emb_size], dtype='float32', - param_attr='shared_w', # share parameter + param_attr=self.lookup_table_name, # share parameter is_sparse=is_sparse, is_distributed=is_distributed) pool = fluid.layers.sequence_pool(input=emb, pool_type='average') @@ -572,7 +569,7 @@ class TestDistLookupTableSliceSize(TestDistLookupTableBase): def transpiler_test_impl(self): config = fluid.DistributeTranspilerConfig() - pserver1, startup1 = self.get_pserver(self.pserver1_ep, config) + pserver1, _ = self.get_pserver(self.pserver1_ep, config) self.assertTrue(self.transpiler.has_distributed_lookup_table) lookup_table_var = 
pserver1.global_block().vars[ @@ -582,6 +579,21 @@ class TestDistLookupTableSliceSize(TestDistLookupTableBase): self.assertEqual(row_size, calc_row_size) +class TestDistArgsInProgram(TestDistLookupTableBase): + def net_conf(self): + self.network_with_table(is_sparse=True, is_distributed=True) + + def transpiler_test_impl(self): + trainer, _ = self.get_trainer() + + self.assertTrue(trainer._is_distributed) + self.assertTrue(trainer._is_chief) + self.assertEqual(trainer._distributed_lookup_table, + self.lookup_table_name) + self.assertEqual(trainer._endpoints, + [self.pserver1_ep, self.pserver2_ep]) + + class TestRMSPropOptimizer(TranspilerTest): def net_conf(self): x = fluid.layers.data(name='x', shape=[1000], dtype='float32') @@ -595,7 +607,6 @@ class TestRMSPropOptimizer(TranspilerTest): avg_cost = fluid.layers.mean(cost) optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) optimizer.minimize(avg_cost) - return def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) @@ -612,5 +623,40 @@ class TestRMSPropOptimizer(TranspilerTest): self.assertEqual(moment_var.shape, (500, 1000)) +class TestLoadSliceVar(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) + optimizer.minimize(avg_cost) + + def transpiler_test_impl(self): + pserver, _ = self.get_pserver(self.pserver1_ep) + pserver2, _ = self.get_pserver(self.pserver2_ep) + + self.assertTrue(pserver._slice_vars_and_attrs) + self.assertTrue(pserver2._slice_vars_and_attrs) + + for idx in xrange(len(pserver._slice_vars_and_attrs)): + self.assertEqual(pserver._slice_vars_and_attrs[idx][0], + pserver2._slice_vars_and_attrs[idx][0]) + + total_numel = reduce(lambda x, y: x * y, + pserver._slice_vars_and_attrs[idx][0].shape) + self.assertEqual( + total_numel, + reduce(lambda x, y: x * y, + pserver._slice_vars_and_attrs[idx][2].shape) + reduce( + lambda x, y: x * y, + pserver2._slice_vars_and_attrs[idx][2].shape)) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py index e43992c488d35d1b3f670e13650d420b0498eeec..38af149ad336fcb818c3cbc9c686bcbdf00238be 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py @@ -18,9 +18,20 @@ from test_dist_base import TestDistBase class TestDistSeResneXt2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + def test_se_resnext(self): self.check_with_place("dist_word2vec.py", delta=1e-7) +class TestDistSeResneXt2x2Async(TestDistBase): + def _setup_config(self): + self._sync_mode = False + + def test_se_resnext(self): + self.check_with_place("dist_word2vec.py", delta=1) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py index 2bb920710a9b10f3a8159bad3b33dd15ffbada19..ff417ad2f16b83cd42a0603375c14450195e7fc0 100644 --- a/python/paddle/fluid/tests/unittests/test_fc_op.py +++ b/python/paddle/fluid/tests/unittests/test_fc_op.py @@ -64,27 +64,47 @@ 
class TestFCOp(OpTest): self.check_output() -class TestFCOpBiasBoth(TestFCOp): +class TestFCOpNoBias(TestFCOp): def init_shapes(self, mb, ic, oc, h, w): - for with_bias in {True, False}: - self.with_bias = with_bias - self.matrix = MatrixGenerate(mb, ic, oc, h, w) + self.with_bias = False + self.matrix = MatrixGenerate(mb, ic, oc, h, w) -class TestFCOp1(TestFCOpBiasBoth): +class TestFCOpWithBias(TestFCOp): + def init_shapes(self, mb, ic, oc, h, w): + self.with_bias = True + self.matrix = MatrixGenerate(mb, ic, oc, h, w) + + +class TestFCOp1(TestFCOpNoBias): def init_op_type(self): self.init_shapes(2, 8, 10, 1, 1) -class TestFCOp2(TestFCOpBiasBoth): +class TestFCOp2(TestFCOpNoBias): def init_op_type(self): self.init_shapes(4, 5, 6, 2, 2) -class TestFCOp4(TestFCOpBiasBoth): +class TestFCOp4(TestFCOpNoBias): def init_op_type(self): self.init_shapes(1, 32, 64, 3, 3) +class TestFCOpWithBias1(TestFCOpWithBias): + def init_op_type(self): + self.init_shapes(3, 8, 10, 2, 1) + + +class TestFCOpWithBias2(TestFCOpWithBias): + def init_op_type(self): + self.init_shapes(4, 5, 6, 2, 2) + + +class TestFCOpWithBias3(TestFCOpWithBias): + def init_op_type(self): + self.init_shapes(1, 64, 32, 3, 3) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py new file mode 100644 index 0000000000000000000000000000000000000000..9d8bef677fd16fb6bdc20b929137b4d885f4efd1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py @@ -0,0 +1,151 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +from test_lstm_op import lstm, ACTIVATION + + +def fc(x, w, b): + return np.dot(x, w) + b + + +def fusion_lstm( + x, # T x M + lod, # 1 x N + wx=None, # M x 4D + bx=None, # 1 x 4D + h0=None, # N x D + c0=None, # N x D + w_h=None, # D x 4D + w_b=None, # 1 x 4D + w_c=None, # 1 x 3D + is_reverse=False, + act_gate=None, + act_cell=None, + act_cand=None): + return lstm( + fc(x, wx, bx), lod, h0, c0, w_h, w_b, w_c, is_reverse, act_gate, + act_cell, act_cand) + + +class TestLstmOp(OpTest): + def set_argument(self): + self.lod = [[2, 3, 2]] + + def setUp(self): + self.op_type = 'fusion_lstm' + self.lod = [[2, 3, 2]] + self.M = 8 + self.D = 16 + self.has_initial_state = False + self.is_reverse = False + self.act_gate = 'sigmoid' + self.act_cell = 'tanh' + self.act_cand = 'tanh' + self.use_peepholes = False + self.set_argument() + + T = sum(self.lod[0]) + bs = len(self.lod[0]) + + x = np.random.normal(size=(T, self.M)).astype('float64') + if self.has_initial_state: + h0 = np.random.normal(size=(bs, self.D)).astype('float64') + c0 = np.random.normal(size=(bs, self.D)).astype('float64') + else: + h0 = np.zeros((bs, self.D)).astype('float64') + c0 = np.zeros((bs, self.D)).astype('float64') + + wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float64') + + if self.use_peepholes: + b = np.random.normal(size=(1, 7 * self.D)).astype('float64') + else: + b = np.random.normal(size=(1, 4 * self.D)).astype('float64') + w_b = np.copy(b[:, 0:4 * self.D]) + w_c = b[:, 4 * self.D:] if self.use_peepholes else None + + # this is the weight of fc + wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float64') + # this is the bias of fc + # and it should be manually added into the bias of this fusion LSTM + bx = np.random.normal(size=(1, 4 * self.D)).astype('float64') + b[0, 0:4 * self.D] += bx[0, :] + h, c = fusion_lstm(x, self.lod, wx, bx, h0, c0, wh, w_b, w_c, + self.is_reverse, ACTIVATION[self.act_gate], + ACTIVATION[self.act_cell], ACTIVATION[self.act_cand]) + + self.inputs = { + 'X': (x, self.lod), + 'WeightX': wx, + 'WeightH': wh, + 'Bias': b + } + + if self.has_initial_state: + self.inputs['H0'] = h0 + self.inputs['C0'] = c0 + + self.outputs = { + 'Hidden': (h, self.lod), + 'Cell': (c, self.lod), + } + self.attrs = { + 'use_peepholes': self.use_peepholes, + 'is_reverse': self.is_reverse, + 'gate_activation': self.act_gate, + 'cell_activation': self.act_cell, + 'candidate_activation': self.act_cand + } + + def test_check_output(self): + self.check_output(atol=1e-8) + + +class TestLstmOpInitReverse(TestLstmOp): + def set_argument(self): + self.has_initial_state = True + self.is_reverse = True + + +class TestLstmOpMD1(TestLstmOp): + def set_argument(self): + self.M = 36 + self.D = 8 + + +class TestLstmOpMD2(TestLstmOp): + def set_argument(self): + self.M = 8 + self.D = 8 + + +class TestLstmOpMD3(TestLstmOp): + def set_argument(self): + self.M = 15 + self.D = 3 + + +class TestLstmOpBS1(TestLstmOp): + def set_argument(self): + self.lod = [[3]] + self.D = 16 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 4994e11d1fb4fa21aeb2a7212cd64daf643adeb8..c45ccee4bd16eeef9b21c221221481be3029510a 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -347,6 +347,25 @@ class TestBook(unittest.TestCase): 
self.assertIsNotNone(loss) print(str(program)) + def test_scatter(self): + program = Program() + with program_guard(program): + x = layers.data( + name='x', + shape=[3, 3], + append_batch_size=False, + dtype='float32') + idx = layers.data( + name='idx', shape=[2], append_batch_size=False, dtype='int32') + updates = layers.data( + name='updates', + shape=[2, 3], + append_batch_size=False, + dtype='float32') + out = layers.scatter(input=x, index=idx, updates=updates) + self.assertIsNotNone(out) + print(str(program)) + def test_lod_reset(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py index 7f75d0e6e9c2ed12eb4a4a7b3cd68e685c711f76..11e5d8b536fb65b66c954991bf815241774702ec 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py @@ -21,36 +21,27 @@ import paddle.fluid.core as core from paddle.fluid.op import Operator -def output_hist(out): - hist, _ = np.histogram(out, range=(-5, 10)) - hist = hist.astype("float32") - hist /= float(out.size) - prob = 0.1 * np.ones((10)) - return hist, prob - - class TestLookupSpraseTable(OpTest): def check_with_place(self, place): scope = core.Scope() - # create and initialize Id Variable - ids = scope.var("Ids").get_tensor() - ids_array = np.array([0, 2, 3, 5, 100]).astype("int64") - ids.set(ids_array, place) - # create and initialize W Variable - rows = [0, 1, 2, 3, 4, 5, 6] - row_numel = 10000 + table_size = 10000 + row_numel = 8 w_selected_rows = scope.var('W').get_selected_rows() - w_selected_rows.set_height(len(rows)) - w_selected_rows.set_rows(rows) - w_array = np.ones((len(rows), row_numel)).astype("float32") - for i in range(len(rows)): + w_selected_rows.set_height(table_size) + w_array = np.ones((table_size, row_numel)).astype("float32") + for i in range(table_size): w_array[i] *= i w_tensor = w_selected_rows.get_tensor() w_tensor.set(w_array, place) + # create and initialize Id Variable + ids = scope.var("Ids").get_tensor() + ids_array1 = np.array([0, 2, 3, 2, 5, 0, 100]).astype("int64") + ids.set(ids_array1, place) + # create Out Variable out_tensor = scope.var('Out').get_tensor() @@ -66,16 +57,28 @@ class TestLookupSpraseTable(OpTest): lookup_table.run(scope, place) # get result from Out - result_array = np.array(out_tensor) + result_array1 = np.array(out_tensor) # all(): return True if all elements of the iterable are true (or if the iterable is empty) - for idx, row in enumerate(ids_array[:-2]): - assert (row == result_array[idx]).all() + assert (result_array1[0] == w_array[0]).all() + assert (result_array1[1] == w_array[1]).all() + assert (result_array1[2] == w_array[2]).all() + assert (result_array1[3] == w_array[1]).all() + assert (result_array1[4] == w_array[3]).all() + assert (result_array1[5] == w_array[0]).all() + assert (result_array1[6] == w_array[4]).all() + + # create and initialize Id Variable + ids = scope.var("Ids").get_tensor() + ids_array2 = np.array([4, 2, 3, 7, 100000]).astype("int64") + ids.set(ids_array2, place) + lookup_table.run(scope, place) - # check the random value - hist, prob = output_hist(result_array[-1]) - self.assertTrue( - np.allclose( - hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) + result_array2 = np.array(out_tensor) + assert (result_array2[0] == w_array[5]).all() + assert (result_array2[1] == w_array[1]).all() + assert (result_array2[2] == 
w_array[2]).all() + assert (result_array2[3] == w_array[6]).all() + assert (result_array2[4] == w_array[7]).all() def test_w_is_selected_rows(self): places = [core.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py index 6d01955993324498de42462b7f85ef6f8e444505..3ac82680733feb4b82ab98669269160e4aad948f 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -67,7 +67,10 @@ class TestOperator(unittest.TestCase): self.assertEqual(mul_op.output("Out"), ["mul.out"]) self.assertEqual( set(mul_op.attr_names), - set(["x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var"])) + set([ + "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var", + "op_callstack" + ])) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) self.assertEqual(mul_op.attr("x_num_col_dims"), 1) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 893acd763fb21fc5040b8a24700eb947a9fe37c6..5b96d641d667eee1aa0c7c6019bf92494f777259 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -183,7 +183,9 @@ class TestMNIST(TestParallelExecutorBase): use_parallel_executor=True) self.assertAlmostEquals( - np.mean(parallel_first_loss), single_first_loss, delta=1e-6) + np.mean(parallel_first_loss), + single_first_loss, + delta=1e-6, ) self.assertAlmostEquals( np.mean(parallel_last_loss), single_last_loss, delta=1e-6) @@ -191,7 +193,7 @@ class TestMNIST(TestParallelExecutorBase): self.check_simple_fc_parallel_accuracy(True) self.check_simple_fc_parallel_accuracy(False) - def check_batchnorm_fc_convergence(self, use_cuda): + def check_batchnorm_fc_convergence(self, use_cuda, use_fast_executor): if use_cuda and not core.is_compiled_with_cuda(): return @@ -203,11 +205,13 @@ class TestMNIST(TestParallelExecutorBase): fc_with_batchnorm, feed_dict={"image": img, "label": label}, - use_cuda=use_cuda) + use_cuda=use_cuda, + use_fast_executor=use_fast_executor) def test_batchnorm_fc(self): - self.check_batchnorm_fc_convergence(True) - self.check_batchnorm_fc_convergence(False) + for use_cuda in (False, True): + for use_fast_executor in (False, True): + self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor) def test_batchnorm_fc_with_new_strategy(self): # FIXME(zcd): close this test temporally. 
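For reference, the rewritten asserts in test_lookup_sparse_table_op.py above rely on lookup_sparse_table giving every previously unseen id the next free table row, so repeated ids reuse their row and a later run keeps extending the same mapping. A minimal sketch of that assumed assignment rule (the helper name first_seen_rows is hypothetical, not part of the Paddle API):

def first_seen_rows(ids, mapping=None):
    # Assign each new id the next free row; repeated ids reuse their row.
    mapping = {} if mapping is None else mapping
    rows = []
    for i in ids:
        if i not in mapping:
            mapping[i] = len(mapping)
        rows.append(mapping[i])
    return rows, mapping


# First run: ids [0, 2, 3, 2, 5, 0, 100] map to rows [0, 1, 2, 1, 3, 0, 4],
# which is why result_array1[4] is compared against w_array[3] in the test.
rows1, mapping = first_seen_rows([0, 2, 3, 2, 5, 0, 100])
assert rows1 == [0, 1, 2, 1, 3, 0, 4]

# Second run continues from the same table: [4, 2, 3, 7, 100000] -> [5, 1, 2, 6, 7].
rows2, _ = first_seen_rows([4, 2, 3, 7, 100000], mapping)
assert rows2 == [5, 1, 2, 6, 7]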
diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index 38a7c913bf07a32494089addf17068017d24d628..7934164b84931f886967982ce0cb65c406bbf800 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -25,9 +25,6 @@ import paddle.fluid.core as core class TestProfiler(unittest.TestCase): def net_profiler(self, state, profile_path='/tmp/profile'): - enable_if_gpu = state == 'GPU' or state == "All" - if enable_if_gpu and not core.is_compiled_with_cuda(): - return startup_program = fluid.Program() main_program = fluid.Program() @@ -81,8 +78,6 @@ class TestProfiler(unittest.TestCase): pass_acc_calculator.add(value=acc, weight=b_size) pass_acc = pass_acc_calculator.eval() - @unittest.skipIf(not core.is_compiled_with_cuda(), - "profiler is enabled only with GPU") def test_cpu_profiler(self): self.net_profiler('CPU') diff --git a/python/paddle/fluid/tests/unittests/test_program_code.py b/python/paddle/fluid/tests/unittests/test_program_code.py new file mode 100644 index 0000000000000000000000000000000000000000..e9c2b928617dce3904ca119896ca81454256e82e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_program_code.py @@ -0,0 +1,81 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import time +import unittest +from multiprocessing import Process +import signal + +import numpy + +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from paddle.fluid.layers.io import ListenAndServ +from paddle.fluid.layers.io import Recv +from paddle.fluid.layers.io import Send + +from paddle.fluid.transpiler.details import program_to_code + + +class TestProgram2Code(unittest.TestCase): + def test_print(self): + place = fluid.CPUPlace() + self.init_serv(place) + self.init_client(place, 9123) + + def init_serv(self, place): + main = fluid.Program() + + with fluid.program_guard(main): + serv = ListenAndServ("127.0.0.1:0", ["X"], optimizer_mode=False) + with serv.do(): + out_var = main.global_block().create_var( + name="scale_0.tmp_0", + psersistable=True, + dtype="float32", + shape=[32, 32]) + x = layers.data( + shape=[32, 32], + dtype='float32', + name="X", + append_batch_size=False) + fluid.initializer.Constant(value=1.0)(x, main.global_block()) + layers.scale(x=x, scale=10.0, out=out_var) + + program_to_code(main) + + def init_client(self, place, port): + main = fluid.Program() + with fluid.program_guard(main): + x = layers.data( + shape=[32, 32], + dtype='float32', + name='X', + append_batch_size=False) + fluid.initializer.Constant(value=2.3)(x, main.global_block()) + get_var = main.global_block().create_var( + name="scale_0.tmp_0", # server side var + dtype="float32", + persistable=False, + shape=[32, 32]) + fluid.initializer.Constant(value=2.3)(get_var, main.global_block()) + Send("127.0.0.1:%d" % port, [x]) + o = Recv("127.0.0.1:%d" % port, [get_var]) + + program_to_code(main) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py index c14a83b4bbcda326f891d809018df0da39978932..b46e4bfb86bd5dc9c74375693328f2506281be3e 100644 --- a/python/paddle/fluid/tests/unittests/test_sgd_op.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py @@ -126,6 +126,7 @@ class TestSGDOpOptimizeSelectedRows(unittest.TestCase): w_selected_rows = scope.var('Param').get_selected_rows() w_selected_rows.set_height(len(param_rows)) w_selected_rows.set_rows(param_rows) + w_selected_rows.sync_index() w_array = np.ones((len(param_rows), row_width)).astype("float32") for i in range(len(param_rows)): w_array[i] *= i diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py index a2a5584459a5d4dc416b9c542a4bb0567982e765..2be8e24a0fae6945351eb767ac924d7ca70848ab 100644 --- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py +++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py @@ -41,7 +41,7 @@ class TestSqueezeOp(OpTest): self.new_shape = (3, 5) def init_attrs(self): - self.attrs = {"axes": self.axes, "inplace": False} + self.attrs = {"axes": self.axes} # Correct: There is mins axis. @@ -68,49 +68,5 @@ class TestSqueezeOp3(TestSqueezeOp): self.new_shape = (3, 5, 1, 4) -# Correct: Inplace. -class TestSqueezeOpInplace1(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (1, 3, 1, 5) - self.axes = (0, 2) - self.new_shape = (3, 5) - - def init_attrs(self): - self.attrs = {"axes": self.axes, "inplace": True} - - -# Correct: Inplace. There is mins axis. 
-class TestSqueezeOpInplace2(TestSqueezeOp): - def inti_test_case(self): - self.ori_shape = (1, 3, 1, 5) - self.axes = (0, -2) - self.new_shape = (3, 5) - - def init_attrs(self): - self.attrs = {"axes": self.axes, "inplace": True} - - -# Correct: Inplace. No axes input. -class TestSqueezeOpInplace3(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (1, 3, 1, 5) - self.axes = () - self.new_shape = (3, 5) - - def init_attrs(self): - self.attrs = {"axes": self.axes, "inplace": True} - - -# Correct: Inpalce. Just part of axes be squeezed. -class TestSqueezeOpInplace4(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (3, 1, 5, 1, 4, 1) - self.axes = (1, -1) - self.new_shape = (3, 5, 1, 4) - - def init_attrs(self): - self.attrs = {"axes": self.axes, "inplace": True} - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_stack_op.py b/python/paddle/fluid/tests/unittests/test_stack_op.py new file mode 100644 index 0000000000000000000000000000000000000000..defdeb5d70df4c39ed8e23247270e6eb3dd14a7a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_stack_op.py @@ -0,0 +1,92 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from op_test import OpTest +import numpy as np +import unittest + + +class TestStackOpBase(OpTest): + def initDefaultParameters(self): + self.num_inputs = 4 + self.input_dim = (5, 6, 7) + self.axis = 0 + self.dtype = 'float32' + + def initParameters(self): + pass + + def get_x_names(self): + x_names = [] + for i in range(self.num_inputs): + x_names.append('x{}'.format(i)) + return x_names + + def setUp(self): + self.initDefaultParameters() + self.initParameters() + self.op_type = 'stack' + self.x = [] + for i in range(self.num_inputs): + self.x.append( + np.random.random(size=self.input_dim).astype(self.dtype)) + + tmp = [] + x_names = self.get_x_names() + for i in range(self.num_inputs): + tmp.append((x_names[i], self.x[i])) + + self.inputs = {'X': tmp} + self.outputs = {'Y': np.stack(self.x, axis=self.axis)} + self.attrs = {'axis': self.axis} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(self.get_x_names(), 'Y') + + +class TestStackOp1(TestStackOpBase): + def initParameters(self): + self.num_inputs = 16 + + +class TestStackOp2(TestStackOpBase): + def initParameters(self): + self.num_inputs = 20 + + +class TestStackOp3(TestStackOpBase): + def initParameters(self): + self.axis = -1 + + +class TestStackOp4(TestStackOpBase): + def initParameters(self): + self.axis = -4 + + +class TestStackOp5(TestStackOpBase): + def initParameters(self): + self.axis = 1 + + +class TestStackOp6(TestStackOpBase): + def initParameters(self): + self.axis = 3 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index 9dc93048e631801acd3b21a3bb140ecc4fcd7732..74797bb65678404b7b35d06eecc7f9a12b2a346e 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -17,6 +17,8 @@ from __future__ import print_function import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.op import Operator class TestSumOp(OpTest): @@ -42,5 +44,66 @@ class TestSumOp(OpTest): pass +class TestSelectedRowsSumOp(OpTest): + def check_with_place(self, place): + scope = core.Scope() + self.check_input_and_optput(scope, place, True, True, True) + self.check_input_and_optput(scope, place, False, True, True) + self.check_input_and_optput(scope, place, False, False, True) + self.check_input_and_optput(scope, place, False, False, False) + + def check_input_and_optput(self, + scope, + place, + w1_has_data=False, + w2_has_data=False, + w3_has_data=False): + + self.create_selected_rows(scope, place, "W1", w1_has_data) + self.create_selected_rows(scope, place, "W2", w2_has_data) + self.create_selected_rows(scope, place, "W3", w3_has_data) + + # create Out Variable + out = scope.var('Out').get_selected_rows() + + # create and run sum operator + sum_op = Operator("sum", X=["W1", "W2", "W3"], Out='Out') + sum_op.run(scope, place) + + has_data_w_num = 0 + for w in [w1_has_data, w2_has_data, w3_has_data]: + if not w: + has_data_w_num += 1 + + self.assertEqual(7 * has_data_w_num, len(out.rows())) + + def create_selected_rows(self, scope, place, var_name, isEmpty): + # create and initialize W Variable + if not isEmpty: + rows = [0, 1, 2, 3, 4, 5, 6] + row_numel = 12 + else: + rows = [] + row_numel = 12 + + var = scope.var(var_name) + w_selected_rows = var.get_selected_rows() + w_selected_rows.set_height(len(rows)) + w_selected_rows.set_rows(rows) + w_array = 
np.ones((len(rows), row_numel)).astype("float32") + for i in range(len(rows)): + w_array[i] *= i + w_tensor = w_selected_rows.get_tensor() + w_tensor.set(w_array, place) + + return var + + def test_w_is_selected_rows(self): + places = [core.CPUPlace()] + # currently only support CPU + for place in places: + self.check_with_place(place) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py index 5fcabe4c83457a91ff0e9cd5568904698353b62a..a324438ba5a3c3b57fd956bd11189ef7d50267e2 100644 --- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py +++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py @@ -41,7 +41,7 @@ class TestUnsqueezeOp(OpTest): self.new_shape = (3, 1, 1, 5) def init_attrs(self): - self.attrs = {"axes": self.axes, "inplace": False} + self.attrs = {"axes": self.axes} # Correct: Single input index. @@ -76,38 +76,5 @@ class TestUnsqueezeOp4(TestUnsqueezeOp): self.new_shape = (3, 1, 1, 2, 5, 1) -# Correct: Inplace. -class TestUnsqueezeOpInplace1(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (3, 5) - self.axes = (0, 2) - self.new_shape = (1, 3, 1, 5) - - def init_attrs(self): - self.attrs = {"axes": self.axes, "inplace": True} - - -# Correct: Inplace. There is mins index. -class TestUnsqueezeOpInplace2(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (3, 5) - self.axes = (0, -2) - self.new_shape = (1, 3, 1, 5) - - def init_attrs(self): - self.attrs = {"axes": self.axes, "inplace": True} - - -# Correct: Inplace. There is duplicated axis. -class TestUnsqueezeOpInplace3(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (3, 2, 5) - self.axes = (0, 3, 3) - self.new_shape = (1, 3, 2, 1, 1, 5) - - def init_attrs(self): - self.attrs = {"axes": self.axes, "inplace": True} - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py index 31ae25f02c6fab0d68e83a131ace054d4eb56c12..34fbb1b549cf5fc5f75bcc0715e5c83665f1d200 100644 --- a/python/paddle/fluid/tests/unittests/testsuite.py +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -153,9 +153,6 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): def append_loss_ops(block, output_names): mean_inputs = list(map(block.var, output_names)) - # for item in mean_inputs: - # print(item) - # print("Item", item.dtype) if len(mean_inputs) == 1: loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1]) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 294308f1877360174c69cc59b0d2037e494985e7..d094647afe1900809fc32cae93f777765f72c675 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -285,11 +285,12 @@ class Trainer(object): self._load_checkpoint() if param_path and os.path.isdir(param_path): - # load params from param_path into scope - io.load_persistables( - executor=exe, - dirname=param_path, - main_program=self.startup_program) + with self._prog_and_scope_guard(): + # load params from param_path into scope + io.load_persistables( + executor=exe, + dirname=param_path, + main_program=self.startup_program) def _transpile_nccl2_dist(self): # PADDLE_TRAINER_IPS diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py index 640dbf4bbed58edf746456419af18c75241fa03c..420ae6dfd4b75b507dd01bb947fa707bca5cdb08 100644 
--- a/python/paddle/fluid/transpiler/details/program_utils.py +++ b/python/paddle/fluid/transpiler/details/program_utils.py @@ -16,6 +16,9 @@ from __future__ import print_function import six +from paddle.fluid import core +import paddle + def delete_ops(block, ops): try: @@ -39,3 +42,133 @@ def find_op_by_output_arg(block, arg_name): if arg_name in op.output_arg_names: return index return -1 + + +def get_indent_space(indent, space_num=4): + ret = "" + for i in range(0, indent * space_num): + ret += " " + + return ret + + +def variable_to_code(var): + """ + Get readable codes of fluid variable. + + Args: + var: A fluid operator. + + Returns: + string: The formatted string. + """ + + var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})".\ + format(i="{", e="}", name=var.name, type=var.type, shape=var.shape, dtype=var.dtype) + + if type(var) == paddle.fluid.framework.Parameter: + if var.trainable: + var_str = "trainable parameter " + var_str + else: + var_str = "parameter " + var_str + else: + var_str = "var " + var_str + + if var.persistable: + var_str = "persist " + var_str + + return var_str + + +def op_to_code(op): + """ + Get readable codes of fluid operator. + + Args: + op: A fluid operator. + + Returns: + string: The foramtted string. + """ + + outputs_str = "{" + for i in range(0, len(op.output_names)): + outputs_str += "{name}=".format(name=op.output_names[i]) + o = op.output(op.output_names[i]) + outputs_str += "{value}".format(value=o) + if i != len(op.output_names) - 1: + outputs_str += ", " + outputs_str += "}" + + inputs_str = "{" + for i in range(0, len(op.input_names)): + inputs_str += "{name}=".format(name=op.input_names[i]) + o = op.input(op.input_names[i]) + inputs_str += "{value}".format(value=o) + + if i != len(op.input_names) - 1: + inputs_str += ", " + inputs_str += "}" + + attrs_str = "" + for i in range(0, len(op.attr_names)): + name = op.attr_names[i] + + attr_type = op.desc.attr_type(name) + if attr_type == core.AttrType.BLOCK: + a = "{name} = block[{value}]".format( + name=name, type=attr_type, value=op.block_attr_id(name)) + attrs_str += a + continue + + if attr_type == core.AttrType.BLOCKS: + a = "{name} = blocks{value}".format( + name=name, type=attr_type, value=op.blocks_attr_ids(name)) + attrs_str += a + continue + + a = "{name} = {value}".format( + name=name, type=attr_type, value=op.desc.attr(name)) + attrs_str += a + if i != len(op.attr_names) - 1: + attrs_str += ", " + + if outputs_str != "{}": + op_str = "{outputs} = {op_type}(inputs={inputs}, {attrs})".\ + format(outputs = outputs_str, op_type=op.type, inputs=inputs_str, attrs=attrs_str) + else: + op_str = "{op_type}(inputs={inputs}, {attrs})".\ + format(op_type=op.type, inputs=inputs_str, attrs=attrs_str) + return op_str + + +def program_to_code(prog): + """ + Print readable codes of fluid program. + + Args: + prog : A fluid program. 
+ + An example result like bellow: + https://github.com/PaddlePaddle/Paddle/pull/12673 + """ + indent = 0 + block_idx = 0 + for block in prog.blocks: + print("{0}{1} // block {2}".format( + get_indent_space(indent), '{', block_idx)) + indent += 1 + # sort all vars + all_vars = sorted(block.vars.iteritems(), key=lambda x: x[0]) + for var in all_vars: + print("{}{}".format( + get_indent_space(indent), variable_to_code(var[1]))) + + if len(all_vars) > 0: + print("") + + for op in block.ops: + print("{}{}".format(get_indent_space(indent), op_to_code(op))) + indent -= 1 + print("{0}{1}".format(get_indent_space(indent), '}')) + block_idx += 1 diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 49d1b6c7b4e8fe353a4e849a407bc988183c96eb..80d9758b3dd0d6c57adc888c492cac26da3939d0 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -34,6 +34,7 @@ import math import random import numpy as np import collections +import six from .ps_dispatcher import RoundRobin, HashName, PSDispatcher from .. import core, framework @@ -210,7 +211,19 @@ class DistributeTranspiler(object): ps_dispatcher = self.config.split_method(self.pserver_endpoints) self.has_distributed_lookup_table = self._has_distributed_lookup_table() - + self.param_name_to_grad_name = dict() + self.grad_name_to_param_name = dict() + for param_var, grad_var in self.params_grads: + self.param_name_to_grad_name[param_var.name] = grad_var.name + self.grad_name_to_param_name[grad_var.name] = param_var.name + + # add distributed attrs to program + self.origin_program._is_distributed = True + self.origin_program._endpoints = self.pserver_endpoints + self.origin_program._is_chief = self.trainer_id == 0 + self.origin_program._distributed_lookup_table = self.table_name if self.table_name else None + + # split and create vars, then put splited vars in dicts for later use. # step 1: split and create vars, then put splited vars in dicts for later use. 
self._init_splited_vars() @@ -229,34 +242,43 @@ class DistributeTranspiler(object): random.seed(self.origin_program.random_seed) random.shuffle(grad_var_mapping_items) - for orig_varname, splited_vars in grad_var_mapping_items: + grad_name_to_send_dummy_out = dict() + for grad_varname, splited_vars in grad_var_mapping_items: eplist = ps_dispatcher.dispatch(splited_vars) if not self.config.slice_var_up: assert (len(splited_vars) == 1) + splited_grad_varname = grad_varname if len(splited_vars) == 1: - orig_varname = splited_vars[0].name + splited_grad_varname = splited_vars[0].name index = find_op_by_output_arg(program.global_block(), - orig_varname) + splited_grad_varname) elif len(splited_vars) > 1: - orig_var = program.global_block().vars[orig_varname] + orig_var = program.global_block().vars[splited_grad_varname] index = find_op_by_output_arg(program.global_block(), - orig_varname) + splited_grad_varname) self._insert_split_op(program, orig_var, index, splited_vars) index += 1 else: AssertionError("Can not insert the send op by original " - "variable name :", orig_varname) + "variable name :", splited_grad_varname) + + dummy_output = program.global_block().create_var( + name=framework.generate_control_dev_var_name()) + grad_name_to_send_dummy_out[grad_varname] = dummy_output program.global_block()._insert_op( index=index + 1, type="send", inputs={"X": splited_vars}, - outputs={}, + outputs={"Out": dummy_output}, attrs={ "epmap": eplist, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + OP_ROLE_VAR_ATTR_NAME: + [self.grad_name_to_param_name[grad_varname], grad_varname], + "sync_mode": not self.sync_mode, }) for _, var in enumerate(splited_vars): send_vars.append(var) @@ -268,7 +290,6 @@ class DistributeTranspiler(object): outputs={}, attrs={ "endpoints": pserver_endpoints, - "sync_mode": self.sync_mode, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) @@ -284,19 +305,25 @@ class DistributeTranspiler(object): self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i]) # step4: Concat the parameters splits together after recv. - for varname, splited_var in six.iteritems(self.param_var_mapping): + for param_varname, splited_var in six.iteritems(self.param_var_mapping): eps = [] for var in splited_var: index = [v.name for v in recv_vars].index(var.name) eps.append(eplist[index]) - + grad_send_dummy_out = grad_name_to_send_dummy_out[ + self.param_name_to_grad_name[param_varname]] program.global_block().append_op( type="recv", - inputs={}, + inputs={"X": [grad_send_dummy_out]}, outputs={"Out": splited_var}, attrs={ "epmap": eps, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + OP_ROLE_VAR_ATTR_NAME: [ + param_varname, + self.param_name_to_grad_name[param_varname] + ], + "sync_mode": not self.sync_mode }) if self.sync_mode: @@ -309,10 +336,10 @@ class DistributeTranspiler(object): RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) - for varname, splited_var in six.iteritems(self.param_var_mapping): + for param_varname, splited_var in six.iteritems(self.param_var_mapping): if len(splited_var) <= 1: continue - orig_param = program.global_block().vars[varname] + orig_param = program.global_block().vars[param_varname] program.global_block().append_op( type="concat", inputs={"X": splited_var}, @@ -359,7 +386,7 @@ class DistributeTranspiler(object): # FIXME(gongwb): delete not need ops. # note that: some parameter is not trainable and those ops can't be deleted. 
- for varname, splited_var in self.param_var_mapping.iteritems(): + for varname, splited_var in six.iteritems(self.param_var_mapping): # Get the eplist of recv vars eps = [] for var in splited_var: @@ -380,7 +407,7 @@ class DistributeTranspiler(object): op = startup_program.global_block().append_op( type="recv", - inputs={}, + inputs={"X": []}, outputs={"Out": splited_var}, attrs={ "epmap": eps, @@ -396,7 +423,7 @@ class DistributeTranspiler(object): RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) - for varname, splited_var in self.param_var_mapping.iteritems(): + for varname, splited_var in six.iteritems(self.param_var_mapping): #add concat ops to merge splited parameters received from parameter servers. if len(splited_var) <= 1: continue @@ -580,6 +607,8 @@ class DistributeTranspiler(object): checkpoint_block_id = self._create_checkpoint_save_block( pserver_program, table_opt_block.idx) + pserver_program._distributed_lookup_table = self.table_name + # NOTE: if has_distributed_lookup_table is False, then prefetch_block will # not be executed, so it's safe to use optimize_block to hold the place if self.has_distributed_lookup_table: @@ -606,6 +635,10 @@ class DistributeTranspiler(object): outputs={}, attrs=attrs) + # add distributed attrs + pserver_program._slice_vars_and_attrs = self._get_slice_vars_and_attrs( + endpoint) + pserver_program._sync_with_cpp() return pserver_program @@ -679,8 +712,31 @@ class DistributeTranspiler(object): inputs=new_inputs, outputs=new_outputs, attrs=op.all_attrs()) + + # add slice vars + s_prog._slice_vars_and_attrs = self._get_slice_vars_and_attrs(endpoint) + return s_prog + def _get_slice_vars_and_attrs(self, endpoint): + slice_vars_and_attrs = [] + block_suffix = "block" + for param in self.param_grad_ep_mapping[endpoint]["params"]: + orig_var_name, block_name, _ = self._get_varname_parts(param.name) + if not block_name: + continue + + block_idx = int(block_name.split(block_suffix)[1]) + orig_var = self.origin_program.global_block().vars[orig_var_name] + + skip_numel = 0 + slice_vars = self.param_var_mapping[orig_var_name] + for slice_var in slice_vars[:block_idx]: + skip_numel += reduce(lambda x, y: x * y, slice_var.shape) + slice_vars_and_attrs.append([orig_var, skip_numel, param]) + + return slice_vars_and_attrs + # ====================== private transpiler functions ===================== def _has_distributed_lookup_table(self): @@ -786,19 +842,21 @@ class DistributeTranspiler(object): self.config.min_block_size) assert (len(grad_blocks) == len(param_blocks)) - # origin_varname -> [splited_var] + # origin_param_name -> [splited_param_vars] self.param_var_mapping = self._create_vars_from_blocklist( self.origin_program, param_blocks) + # origin_grad_name -> [splited_grad_vars] self.grad_var_mapping = self._create_vars_from_blocklist( self.origin_program, grad_blocks, add_trainer_suffix=self.trainer_num > 1) + # dict(grad_splited_var -> param_splited_var) self.grad_param_mapping = collections.OrderedDict() for g, p in zip(grad_blocks, param_blocks): g_name, g_bid, _ = g.split(":") p_name, p_bid, _ = p.split(":") self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ - self.param_var_mapping[p_name][int(p_bid)] + self.param_var_mapping[p_name][int(p_bid)] # create mapping of endpoint -> split var to create pserver side program self.param_grad_ep_mapping = collections.OrderedDict() @@ -919,11 +977,15 @@ class DistributeTranspiler(object): index=op_index + 2, type="send", inputs={'X': self.trainer_side_table_grad_list}, - outputs={}, + 
outputs={'Out': []}, attrs={ "sync_mode": True, "epmap": pserver_endpoints, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + OP_ROLE_VAR_ATTR_NAME: [ + self.grad_name_to_param_name[table_grad_name], + table_grad_name + ] }) break @@ -1197,8 +1259,8 @@ class DistributeTranspiler(object): elif op_type == "momentum": if varkey == "Velocity": return param_shape - elif op_type == "": - if varkey == "Moment": + elif op_type == "rmsprop": + if varkey in ["Moment", "MeanSquare"]: return param_shape elif op_type == "sgd": pass @@ -1277,8 +1339,6 @@ class DistributeTranspiler(object): pserver_block = program.global_block() new_inputs = collections.OrderedDict() - # update param/grad shape first, then other inputs like - # moment can use the updated shape def _get_param_block(opt_op): # param is already created on global program param_block = None @@ -1291,22 +1351,6 @@ class DistributeTranspiler(object): for key in opt_op.input_names: if key == "Grad": new_inputs[key] = merged_var - # For RMSProp optimizer - elif key == "Moment" or key == "MeanSquare": - param_block = _get_param_block(opt_op) - if not param_block: - return - moment_var = origin_program.global_block().vars[opt_op.input( - key)[0]] - tmpvar = pserver_block.create_var( - name=moment_var.name, - persistable=moment_var.persistable, - dtype=moment_var.dtype, - # change to use same shape as param - # TODO(typhoonzero): didn't append .block in the var name, - # may affect checkpoint saving? Need to verify. - shape=param_block.shape) - new_inputs[key] = tmpvar elif key == "Param": param_block = _get_param_block(opt_op) if not param_block: @@ -1334,7 +1378,7 @@ class DistributeTranspiler(object): for key in opt_op.input_names: new_shape = None - if key in ["Param", "Grad", "LearningRate", "Moment", "MeanSquare"]: + if key in ["Param", "Grad", "LearningRate"]: continue var = self.origin_program.global_block().vars[opt_op.input(key)[0]] # update accumulator variable shape diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index 42005839c44c1b3e8b910e9a3932181468659cae..f79fcb24bb5a48de00d03fc468b6526e48656d07 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -59,8 +59,12 @@ class InferenceTranspiler(object): scope = global_scope() if not isinstance(scope, core.Scope): raise TypeError("scope should be as Scope type or None") - self._fuse_batch_norm(program, place, scope) - self._fuse_relu_mkldnn(program) + use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) + if use_mkldnn: + self._fuse_relu_mkldnn(program) + self._fuse_conv_bias_mkldnn(program) + else: + self._fuse_batch_norm(program, place, scope) def _fuse_relu_mkldnn(self, program): ''' @@ -82,10 +86,6 @@ class InferenceTranspiler(object): :param program: program to transpile :type program: Program ''' - use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) - if not use_mkldnn: - return - self.block = program.block(0) i = 0 @@ -106,6 +106,69 @@ class InferenceTranspiler(object): # And a better solution will be considered later. program = program.clone() + def _fuse_conv_bias_mkldnn(self, program): + ''' + Transpile the program by fused convolution and elementwise_add. + + Replace conv2d and elementwise_add ops with a new conv2d op + based on an old conv2d op and the :math:`Bias` taken from + elementwise_add. 
+ + For input :math:`X`: + + - Conv process: :math:`X = input * W` + - Elementwise_add process: :math` X = X + bias` + + After fuse into one operation: + + .. math:: + + X = input * W + bias + + The operator transformation is: + + - before: + + - conv->elementwise_add->any_other_op + + - after: + + - conv->any_other_op + + The transpile stages are: + + 1. Extract bias and output variables from elementwise_add. + 2. Extract Input, Weight and attributes from conv op. + 3. Create a new convolution op based on extracted params. + 4. Remove old conv op. + 5. Remove elementwise_add. + 5. Remove unused variables. + + Args: + program (Program): program to transpile + + ''' + self.block = program.block(0) + + i = 0 + while i < len(self.block.ops) - 2: + current_op = self.block.ops[i] + next_op = self.block.ops[i + 1] + # conv2d with bias + if current_op.type in ['conv2d'] and \ + next_op.type in ['elementwise_add']: + self._fuse_conv_bias(i, current_op, next_op) + self.block._remove_op(i + 1) # Remove old conv + self.block._remove_op(i + 1) # Remove elementwise_add + i = i + 1 + i = i + 1 + + self._remove_unused_var() + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. + # And a better solution will be considered later. + program = program.clone() + def _fuse_batch_norm(self, program, place, scope): ''' Transpile the program by fused batch normalization. @@ -185,7 +248,6 @@ class InferenceTranspiler(object): self.block._remove_op(i + 2) i = i + 1 i = i + 1 - self._adjust_input() self._remove_unused_var() # TODO(luotao): use clone() method to flush the program.desc in force, @@ -288,6 +350,33 @@ class InferenceTranspiler(object): # collect the renamed input self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0] + def _fuse_conv_bias(self, index, conv_op, elementwise_add_op): + ''' + fuse the conv op with elementwise_add + + :param index: index of the conv_op in ops list + :type index: Int + :param conv_op: convolution operator + :type conv_op: Operator + :param elementwise_add_op: convolution's bias operator + :type elementwise_add_op: Operator + ''' + + bias_var = self.block.var(elementwise_add_op.input("Y")[0]) + out_var = self.block.var(elementwise_add_op.output("Out")[0]) + filter_var = self.block.var(conv_op.input("Filter")[0]) + in_var = self.block.var(conv_op.input("Input")[0]) + attrs = {name: conv_op.attr(name) for name in conv_op.attr_names} + + self.block._insert_op( + index, + type="conv2d", + inputs={"Input": in_var, + "Filter": filter_var, + "Bias": bias_var}, + outputs={"Output": out_var}, + attrs=attrs) + def _adjust_input(self): for i in range(len(self.block.ops)): current_op = self.block.ops[i] diff --git a/python/setup.py.in b/python/setup.py.in index 4a6cddbbea4903f5a65123aa19b7e978b335f32b..786c9f2e39880b68700b8acb94b3d35a48323958 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -159,18 +159,20 @@ if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_LIB}', libs_path) shutil.copy('${MKLML_IOMP_LIB}', libs_path) package_data['paddle.libs']+=['libmklml_intel.so','libiomp5.so'] -if '${WITH_MKLDNN}' == 'ON': - # TODO(typhoonzero): use install_name_tool to patch mkl libs once - # we can support mkl on mac. - # - # change rpath of libmkldnn.so.0, add $ORIGIN/ to it. - # The reason is that all thirdparty libraries in the same directory, - # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so. 
- command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" - if os.system(command) != 0: - raise Exception("patch libmkldnn.so failed, command: %s" % command) - package_data['paddle.libs']+=['libmkldnn.so.0'] - shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) +if '${CMAKE_BUILD_TYPE}' == 'Release': + # only change rpath in Release mode. + if '${WITH_MKLDNN}' == 'ON': + # TODO(typhoonzero): use install_name_tool to patch mkl libs once + # we can support mkl on mac. + # + # change rpath of libmkldnn.so.0, add $ORIGIN/ to it. + # The reason is that all thirdparty libraries in the same directory, + # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so. + command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" + if os.system(command) != 0: + raise Exception("patch libmkldnn.so failed, command: %s" % command) + package_data['paddle.libs']+=['libmkldnn.so.0'] + shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) # remove unused paddle/libs/__init__.py os.remove(libs_path+'/__init__.py') package_dir['paddle.libs']=libs_path @@ -179,20 +181,22 @@ package_dir['paddle.libs']=libs_path # The reason is that libwarpctc.so, libiomp5.so etc are in paddle.libs, and # core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries. # This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213 -if "@APPLE@" == "1": - command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" -else: - command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" -if os.system(command) != 0: - raise Exception("patch core.so failed, command: %s" % command) -if '${WITH_FLUID_ONLY}'== 'OFF': - # change rpath of _swig_paddle.so. +if '${CMAKE_BUILD_TYPE}' == 'Release': + # only change rpath in Release mode, since in Debug mode, core.so is too large to be changed. if "@APPLE@" == "1": - command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" else: - command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" if os.system(command) != 0: - raise Exception("patch _swig_paddle.so failed, command: %s" % command) + raise Exception("patch core.so failed, command: %s" % command) + if '${WITH_FLUID_ONLY}'== 'OFF': + # change rpath of _swig_paddle.so. 
+ if "@APPLE@" == "1": + command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + else: + command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + if os.system(command) != 0: + raise Exception("patch _swig_paddle.so failed, command: %s" % command) setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64 index 987fc01cfc13bcefbb9d792a6780fdfff158755d..0d59e4c110ff8502acb4dbcda15f855f7652a946 100644 --- a/tools/manylinux1/Dockerfile.x64 +++ b/tools/manylinux1/Dockerfile.x64 @@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH} ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig -RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz freetype-devel libpng-devel graphviz +RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz COPY build_scripts /build_scripts RUN bash build_scripts/build.sh && \ bash build_scripts/install_nccl2.sh && rm -r build_scripts diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh index d99d3db2ed7e3a719f044d8117baeb7ac212b74d..eb4b477dcb538f7ba17cfc54057a97c9669a6916 100644 --- a/tools/manylinux1/build_scripts/build.sh +++ b/tools/manylinux1/build_scripts/build.sh @@ -28,7 +28,7 @@ AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969 PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel" # Libraries that are allowed as part of the manylinux1 profile -MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel" +MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel freetype-devel libpng-devel" # Get build utilities MY_DIR=$(dirname "${BASH_SOURCE[0]}")