diff --git a/.gitignore b/.gitignore
index 9e3a0b499f9f42856429f3a42bef313ea3df3699..b92bb9cc129659fa502b4a9b55548992412e5429 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
 python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
 python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
 *.DS_Store
+*.vs
 build/
 build_doc/
 *.user
@@ -15,6 +16,7 @@ build_doc/
 .cproject
 .pydevproject
 .settings/
+CMakeSettings.json
 Makefile
 .test_env/
 third_party/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 920c20d6f813c14df8e02593b1c4a5a13cc11ef0..48e52961a95d50264b201eec50ccb3a462f39c54 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -204,22 +204,24 @@ include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)
 
-set(WITH_ANAKIN OFF CACHE STRING "Disable Anakin first, will add it later." FORCE)
+include(flags)              # set paddle compile flags
+include(cudnn)              # set cudnn libraries, must before configure
+include(cupti)
+include(configure)          # add paddle env configuration
+
 if(WITH_GPU)
     include(cuda)
     include(tensorrt)
     include(external/anakin)
+elseif()
+    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
 endif()
 
-include(cudnn)              # set cudnn libraries, must before configure
-include(cupti)
-include(configure)          # add paddle env configuration
 include(generic)            # simplify cmake module
 include(package)            # set paddle packages
 include(ccache)             # set ccache for compilation
 include(util)               # set unittest and link libs
 include(rdma)               # set rdma libraries
-include(flags)              # set paddle compile flags
 include(version)            # set PADDLE_VERSION
 include(coveralls)          # set code coverage
 include(inference_lib)      # add paddle fluid inference libraries
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index c35096e09b5685ee30f20648e7dd461f71e0b1c4..e03e15bfc017ce33e06192a7fa8010ffe060adcb 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -50,7 +50,11 @@ if(NOT WITH_PROFILER)
 endif(NOT WITH_PROFILER)
 
 if(NOT CMAKE_CROSSCOMPILING)
-    if(WITH_AVX AND AVX_FOUND)
+    if(WITH_AVX AND AVX512F_FOUND)
+        set(SIMD_FLAG ${AVX512F_FLAG})
+    elseif(WITH_AVX AND AVX2_FOUND)
+        set(SIMD_FLAG ${AVX2_FLAG})
+    elseif(WITH_AVX AND AVX_FOUND)
         set(SIMD_FLAG ${AVX_FLAG})
     elseif(SSE3_FOUND)
         set(SIMD_FLAG ${SSE3_FLAG})
@@ -99,12 +103,21 @@ if(WITH_GPU)
     endif()
     if(WITH_ANAKIN)
         if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
-            message(FATAL_ERROR "Anakin needs CUDA >= 8.0 to compile")
+            message(WARNING "Anakin needs CUDA >= 8.0 to compile. Force WITH_ANAKIN=OFF")
+            set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDA >= 8.0." FORCE)
         endif()
         if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-            message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile")
+            message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF")
+            set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE)
         endif()
     endif()
+    if(WITH_ANAKIN)
+        # NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR
+        # is a softlink to real cudnn.h directory
+        set(ENV{CUDNN_INCLUDE_DIR} "${CUDNN_INCLUDE_DIR}/")
+        get_filename_component(CUDNN_LIBRARY_DIR ${CUDNN_LIBRARY} DIRECTORY)
+        set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY_DIR})
+    endif()
 elseif(WITH_AMD_GPU)
     add_definitions(-DPADDLE_WITH_HIP)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index 9eebea816cbfc91052c95ecf99ecc4b0bea4e4c2..cd51533926de7bb132ab7bfab1686d664a331410 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -25,8 +25,25 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
     $ENV{CUDNN_ROOT}
     $ENV{CUDNN_ROOT}/lib64
     $ENV{CUDNN_ROOT}/lib
-    /usr/lib)
-find_library(CUDNN_LIBRARY NAMES libcudnn.so libcudnn.dylib # libcudnn_static.a
+    /usr/lib
+	${CUDA_TOOLKIT_ROOT_DIR}
+	${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+	)
+set(CUDNN_LIB_NAME "")
+if (LINUX)
+set(CUDNN_LIB_NAME "libcudnn.so")
+endif(LINUX)
+
+if(WIN32)
+# only support cudnn7
+set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll")
+endif(WIN32)
+
+if(Apple)
+set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so")
+endif(Apple)
+
+find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
     PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
           NO_DEFAULT_PATH
     DOC "Path to cuDNN library.")
diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake
index 403873a51017b9bb7c888bed77d89ca9b15b68d2..78be0749091fb0a617f9fb172cc92b33560a3552 100644
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -2,6 +2,11 @@ if (NOT WITH_ANAKIN)
   return()
 endif()
 
+option(ANAKIN_ENABLE_OP_TIMER      "Get more detailed information with Anakin op time"        OFF)
+if(ANAKIN_ENABLE_OP_TIMER)
+  add_definitions(-DPADDLE_ANAKIN_ENABLE_OP_TIMER)
+endif()
+
 INCLUDE(ExternalProject)
 set(ANAKIN_SOURCE_DIR  ${THIRD_PARTY_PATH}/anakin)
 # the anakin install dir is only default one now
@@ -11,33 +16,45 @@ set(ANAKIN_LIBRARY     ${ANAKIN_INSTALL_DIR})
 set(ANAKIN_SHARED_LIB  ${ANAKIN_LIBRARY}/libanakin.so)
 set(ANAKIN_SABER_LIB   ${ANAKIN_LIBRARY}/libanakin_saber_common.so)
 
-# TODO(luotao): ANAKIN_MODLE_URL will move to demo ci later.
-set(ANAKIN_MODLE_URL "http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2.anakin.bin")
+# TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
+set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
+set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
+set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
+set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
 execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL}")
+execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
+execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
+execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
 
 include_directories(${ANAKIN_INCLUDE})
 include_directories(${ANAKIN_INCLUDE}/saber/)
+include_directories(${ANAKIN_INCLUDE}/saber/core/)
+include_directories(${ANAKIN_INCLUDE}/saber/funcs/impl/x86/)
+include_directories(${ANAKIN_INCLUDE}/saber/funcs/impl/cuda/base/cuda_c/)
 
-set(ANAKIN_COMPILE_EXTRA_FLAGS 
+set(ANAKIN_COMPILE_EXTRA_FLAGS
     -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
-    -Wno-error=unused-variable -Wno-unused-variable 
+    -Wno-error=unused-variable -Wno-unused-variable
     -Wno-error=format-extra-args -Wno-format-extra-args
     -Wno-error=comment -Wno-comment 
     -Wno-error=format -Wno-format 
+    -Wno-error=maybe-uninitialized -Wno-maybe-uninitialized
     -Wno-error=switch -Wno-switch
-    -Wno-error=return-type -Wno-return-type 
+    -Wno-error=return-type -Wno-return-type
     -Wno-error=non-virtual-dtor -Wno-non-virtual-dtor
+    -Wno-error=ignored-qualifiers
+    -Wno-ignored-qualifiers
     -Wno-sign-compare
-    -Wno-reorder 
+    -Wno-reorder
     -Wno-error=cpp)
 
 ExternalProject_Add(
     extern_anakin
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    # TODO(luotao): use PaddlePaddle/Anakin later
+    DEPENDS             ${MKLML_PROJECT}
+    # Anakin codes error on Intel(R) Xeon(R) Gold 5117 CPU, temporary do not compile avx512 related code.
     GIT_REPOSITORY      "https://github.com/luotao1/Anakin"
-    GIT_TAG             "3957ae9263eaa0b1986758dac60a88852afb09be"
+    GIT_TAG             "211d1fc5d813d70c0c14072f9083cf25f40940ea"
     PREFIX              ${ANAKIN_SOURCE_DIR}
     UPDATE_COMMAND      ""
     CMAKE_ARGS          -DUSE_GPU_PLACE=YES
@@ -46,6 +63,8 @@ ExternalProject_Add(
                         -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
                         -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
                         -DCUDNN_ROOT=${CUDNN_ROOT}
+                        -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR}
+                        -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
                         ${EXTERNAL_OPTIONAL_ARGS}
     CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
 )
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 1120677a37e0d44163816b66600121c8f0d545af..e0556a0babc74ba6efa0a190d4f7b77416bef3bf 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -102,7 +102,6 @@ set(COMMON_FLAGS
     -fno-omit-frame-pointer
     -Wall
     -Wextra
-    -Werror
     -Wnon-virtual-dtor
     -Wdelete-non-virtual-dtor
     -Wno-unused-parameter
@@ -115,6 +114,11 @@ set(COMMON_FLAGS
     -Wno-error=terminate  # Warning in PADDLE_ENFORCE
 )
 
+# https://github.com/PaddlePaddle/Paddle/issues/12773
+if (NOT WIN32)
+list(APPEND COMMON_FLAGS -Werror)
+endif()
+
 set(GPU_COMMON_FLAGS
     -fPIC
     -fno-omit-frame-pointer
@@ -142,6 +146,11 @@ else()
         ${GPU_COMMON_FLAGS})
 endif()
 
+if(UNIX AND NOT APPLE)
+  # except apple from nix*Os family
+  set(LINUX TRUE)
+endif(UNIX AND NOT APPLE)
+
 
 foreach(flag ${COMMON_FLAGS})
     safe_set_cflag(CMAKE_C_FLAGS ${flag})
diff --git a/cmake/simd.cmake b/cmake/simd.cmake
index 53c2de332ea74b06d1bd6e5bb119cad6af27ed01..3eacf4d86aa0385eddb690d72e85e3384929bb99 100644
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -10,6 +10,7 @@ if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID
     set(SSE3_FLAG "-msse3")
     set(AVX_FLAG "-mavx")
     set(AVX2_FLAG "-mavx2")
+    set(AVX512F_FLAG "-mavx512f")
 elseif(MSVC)
     set(MMX_FLAG "/arch:MMX")
     set(SSE2_FLAG "/arch:SSE2")
@@ -81,5 +82,16 @@ int main()
     return 0;
 }" AVX2_FOUND)
 
+# Check AVX512F
+set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
+set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+CHECK_CXX_SOURCE_RUNS("
+#include <immintrin.h>
+int main()
+{
+    __m512i a = _mm512_undefined_epi32();
+    return 0;
+}" AVX512F_FOUND)
+
 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
-mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
+mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
diff --git a/doc/fluid/design/others/graph_survey.md b/doc/fluid/design/others/graph_survey.md
index 6c6db08f463ae0a2b94fc4546f123a1d7c151870..97f395133b48a1d0ed5136f0ebc8720b8ca87ded 100644
--- a/doc/fluid/design/others/graph_survey.md
+++ b/doc/fluid/design/others/graph_survey.md
@@ -28,7 +28,7 @@ def get_symbol(num_classes=10, **kwargs):
 
 
 
-Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own NodeAttr. There is a op field in NodeAttr class, when a Symbol represents Variable(often input data), the op field is null.
+Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own AnyAttr. There is a op field in AnyAttr class, when a Symbol represents Variable(often input data), the op field is null.
 
 Symbol contains a data member, std::vector<NodeEntry> outputs, and NodeEntry cantains a poniter to Node. We can follow the Node pointer to get all the Graph.
 
diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md
index 587d819f79fcf82549826359fbf04ad3af404446..c00f73be955e0fb54bb01ffa9a61b3f27c112f75 100644
--- a/doc/fluid/dev/new_op_cn.md
+++ b/doc/fluid/dev/new_op_cn.md
@@ -119,10 +119,29 @@ $$Out = scale*X$$
 
 这个例子有`AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
 
+### 定义GradProtoMaker类
+每个Op的必须有一个对应的GraProtoMaker，若未定制对应前向Op的GradProtoMaker，fluid提供了DefaultGradProtoMaker，默认注册会使用全部输入输出，包括Input, Output, Output@Grad等，使用不需要的变量的会造成显存浪费。
+下面示例定义了ScaleOp的GradProtoMaker。
+
+```cpp
+class ScaleGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("scale");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttr("scale", GetAttr("scale"));
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+```
 
 ### 定义Operator类
 
-下面的点实现了MulOp的定义：
+下面实现了MulOp的定义：
 
 ```cpp
 class MulOp : public framework::OperatorWithKernel {
@@ -334,3 +353,83 @@ ctest -R test_mul_op
 - 注册Op时的类型名，需要和该Op的名字一样。即不允许在`A_op.cc`里面，注册`REGISTER_OPERATOR(B, ...)`等，这将会导致单元测试出错。
 - 如果Op没有实现CUDA Kernel，请不要创建空的`*_op.cu`，这将会导致单元测试出错。
 - 如果多个Op依赖一些共用的函数，可以创建非`*_op.*`格式的文件来存放，如`gather.h`文件。
+
+### PADDLE_ENFORCE使用注意
+
+实现Op时检查数据的合法性需要使用PADDLE_ENFORCE以及PADDLE_ENFORCE_EQ等宏定义，基本格式如下：
+
+```
+PADDLE_ENFORCE(表达式, 错误提示信息)
+PADDLE_ENFORCE_EQ(比较对象A, 比较对象B, 错误提示信息)
+```
+
+如果表达式为真，或者比较对象A=B，则检查通过，否则会终止程序运行，向用户反馈相应的错误提示信息。
+为了确保提示友好易懂，开发者需要注意其使用方法。
+
+#### 总体原则
+
+任何使用了PADDLE_ENFORCE与PADDLE_ENFORCE_**检查的地方，必须有详略得当的备注解释！**错误提示信息**不能为空！
+
+#### 提示信息书写标准
+
+1. [required] 哪里错了？为什么错了？
+    - 例如：`ValueError: Mismatched label shape`
+2. [optional] 期望的输入是什么样的？实际的输入是怎样的？
+    - 例如：`Expected labels dimension=1. Received 4.`
+3. [optional] 能否给出修改意见？
+    - 例如：`Suggested Fix:If your classifier expects one-hot encoding label,check your n_classes argument to the estimatorand/or the shape of your label.Otherwise, check the shape of your label.`
+
+如果并非必要或者简洁的描述即可表达清楚以上要点，根据情况书写亦可。
+
+##### FAQ 典型问题
+
+1. 无报错信息或报错信息过于简单，不能给用户提供有效的提示！
+
+问题示例1 ：未写提示信息
+```
+PADDLE_ENFORCE(ctx->HasInput("X"), "");
+```
+问题示例2 ：提示信息过于简单
+```
+PADDLE_ENFORCE(i != nullptr, "I must be set"); // I是什么？
+```
+
+2. 在报错信息中使用开发人员定义的变量缩写，不易理解！
+
+问题示例：
+```
+PADDLE_ENFORCE(forward_pd != nullptr,
+                    "Fail to find eltwise_fwd_pd in device context");  //eltwise_fwd_pd用户可能看不懂
+```
+
+3. OP内部调用非法接口：Op内部如果出现Output = ShareDataWith(Input) 
+问题示例：
+```cpp
+auto *out = ctx.Output<framework::LoDTensor>("Out");
+auto *in = ctx.Input<framework::LoDTensor>("X");
+out->ShareDataWith(*in);
+```
+Op内部如果出现Output = ShareDataWith(Input)，相当于operator图的中有一条隐藏边，连接了Input和Output，这条边无法在图分析中表达，引发基于图优化的错误。
+
+4. OP实现的性能实践
+调用了eigen的broadcast, chop等操作，性能会比手写cuda kernel差几倍以上。此时cpu的实现可以复用eigen，gpu实现可以实现cuda kernel.
+
+
+#### OP InferShape检查提示信息特别说明
+
+- 检查输入输出变量，请统一遵循以下格式
+`Input(变量名) of OP名 operator should not be null.`  
+
+正确示例：
+```
+PADDLE_ENFORCE(ctx->HasInput("Input"),
+                        "Input(Input) of LSTMP operator should not be null.");
+```
+
+- 反向Op的输入输出检查，要写明反向Op的名字
+
+正确示例：
+```
+PADDLE_ENFORCE(ctx->HasInput("X"),
+                        "Input(X) of LoDResetGrad opreator should not be null.");
+```
diff --git a/doc/fluid/dev/use_eigen_cn.md b/doc/fluid/dev/use_eigen_cn.md
index 75922e7d85a13e53ce94619a48d8da8b960e6c9a..56203d6fad444f61ef1be187ad0d149b2aa99ba4 100644
--- a/doc/fluid/dev/use_eigen_cn.md
+++ b/doc/fluid/dev/use_eigen_cn.md
@@ -7,7 +7,7 @@
 
 Eigen Tensor模块对element-wise计算提供了强大的支持，并且书写一份代码，可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块，因此可能测试不够完备，文档较少。
 
-关于Eigen Tensor模块的详细介绍请参考[文档1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) 和[文档2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md)
+关于Eigen Tensor模块的详细介绍请参考[Eigen文档](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md)
 
 
 ## paddle::framework::Tensor
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index e963902a50200b785284e8f233fcca1abf459140..9250cde1b2bc8fa1e14c0ba1ea9b509c496fc506 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -78,7 +78,7 @@ paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', '
 paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True))
-paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
 paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0))
@@ -153,6 +153,7 @@ paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'n
 paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
 paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -250,7 +251,6 @@ paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwarg
 paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 1d62792b80dd002b894da28be9162fc7d3ce054e..2ec422cc17faf7f6b99ac70b5f175881bf017566 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -99,8 +99,13 @@ else()
   cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method)
 endif()
 
-
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass)
+if (NOT WIN32)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS
+        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
+        graph graph_viz_pass multi_devices_graph_pass
+        multi_devices_graph_print_pass multi_devices_graph_check_pass
+        fast_threaded_ssa_graph_executor)
+endif() # NOT WIN32
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
@@ -115,6 +120,8 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
 # cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
 
+cc_test(rw_lock_test SRCS rw_lock_test.cc)
+
 # disable test temporarily.
 # TODO https://github.com/PaddlePaddle/Paddle/issues/11971
 # cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h
index 8428bf8e3392f68c9d1e2553f4d017cb620bb9f3..14ca3e96209ed17f12e87fda8506806514698977 100644
--- a/paddle/fluid/framework/attribute.h
+++ b/paddle/fluid/framework/attribute.h
@@ -128,7 +128,8 @@ struct ExtractAttribute {
       attr_value = &boost::get<T>(attr);
     } catch (boost::bad_get& bad_get) {
       PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s",
-                   attr_name_, typeid(T).name(), attr.type().name());
+                   attr_name_, paddle::platform::demangle(typeid(T).name()),
+                   paddle::platform::demangle(attr.type().name()));
     }
     return attr_value;
   }
@@ -160,7 +161,7 @@ struct ExtractAttribute<bool> {
       attr_value = &boost::get<bool>(attr);
     } catch (boost::bad_get& bad_get) {
       PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s",
-                   attr_name_, attr.type().name());
+                   attr_name_, paddle::platform::demangle(attr.type().name()));
     }
     return attr_value;
   }
@@ -186,7 +187,7 @@ struct ExtractAttribute<int64_t> {
       attr_value = &boost::get<int64_t>(attr);
     } catch (boost::bad_get& bad_get) {
       PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
-                   attr_name_, attr.type().name());
+                   attr_name_, paddle::platform::demangle(attr.type().name()));
     }
     return attr_value;
   }
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 8f6c4163d6ee11fbe83f603f6148c2ac6175324d..abd5459f6d47da6d1341284916b419325dc5977c 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -42,3 +42,5 @@ cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_b
 cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor)
 #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
 #        device_context reduce_op_handle )
+cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
+        DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
index 716d674fa29bad9321fc20979775c06f26bf4679..5183be878eb49cccc68603c3fdd8023be5578036 100644
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -19,10 +19,13 @@ namespace framework {
 namespace details {
 
 struct ExecutionStrategy {
+  enum ExecutorType { kDefault = 0, kExperimental = 1 };
+
   size_t num_threads_{0};
   bool use_cuda_{true};
   bool allow_op_delay_{false};
   size_t num_iteration_per_drop_scope_{100};
+  ExecutorType type_{kDefault};
 };
 
 }  //  namespace details
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7606f2bc06b2ecf07c5649eeae1a2d5587a8880c
--- /dev/null
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -0,0 +1,175 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/fetch_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
+    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places,
+    std::unique_ptr<ir::Graph> &&graph)
+    : strategy_(strategy),
+      local_scopes_(local_scopes),
+      places_(places),
+      graph_(std::move(graph)),
+      pool_(strategy.num_threads_ +
+            1),  // add one more thread for generate op_deps
+      fetch_ctxs_(places) {
+  auto &ops = graph_->Get<details::GraphOps>("ops");
+
+  for (auto &op : ops) {
+    int dep = static_cast<int>(op->NotReadyInputSize());
+    op_deps_.emplace(op.get(), dep);
+    if (dep == 0) {
+      bootstrap_ops_.emplace_back(op.get());
+    }
+  }
+
+  PrepareAtomicOpDeps();
+}
+
+FeedFetchList FastThreadedSSAGraphExecutor::Run(
+    const std::vector<std::string> &fetch_tensors) {
+  std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>
+      op_deps = atomic_op_deps_.get();
+  PrepareAtomicOpDeps();
+
+  paddle::framework::FeedFetchList fetches;
+  fetches.resize(fetch_tensors.size());
+  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
+  std::vector<std::unique_ptr<ir::Node>> fetch_nodes;
+  std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
+
+  for (auto &fetch_var_name : fetch_tensors) {
+    for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
+      auto it = var_map.find(fetch_var_name);
+      if (it != var_map.end()) {
+        fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
+      }
+    }
+  }
+
+  for (size_t i = 0; i < fetch_tensors.size(); ++i) {
+    auto &var_name = fetch_tensors[i];
+    auto fetched_var_it = fetched_vars.find(var_name);
+    PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
+                   "Cannot find fetched variable.(Perhaps the main_program "
+                   "is not set to ParallelExecutor)");
+
+    auto &vars = fetched_var_it->second;
+
+    fetch_nodes.emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation));
+    auto *op = new FetchOpHandle(fetch_nodes.back().get(), &fetches, i,
+                                 &local_scopes_);
+    fetch_ops.emplace_back(op);
+
+    for (auto &p : places_) {
+      op->SetDeviceContext(p, fetch_ctxs_.Get(p));
+    }
+
+    for (auto *var : vars) {
+      op->AddInput(var);
+    }
+
+    (*op_deps)[op] = static_cast<int>(op->NotReadyInputSize());
+  }
+
+  size_t num_complete = 0;
+  remaining_ = 0;
+  BlockingQueue<size_t> complete_q;
+  for (auto op : bootstrap_ops_) {
+    RunOpAsync(op_deps.get(), op, &complete_q);
+  }
+
+  while (num_complete != op_deps->size()) {
+    size_t num_comp = complete_q.Pop();
+    if (num_comp == -1UL) {
+      int remaining = 0;
+      while (true) {
+        remaining = remaining_;
+        if (remaining == 0) {
+          break;
+        }
+        for (int i = 0; i < remaining; ++i) {
+          complete_q.Pop();
+        }
+      }
+      exception_.ReThrow();
+    }
+    num_complete += num_comp;
+  }
+  // Wait FetchOps.
+  if (!fetch_ops.empty()) {
+    fetch_ops.clear();
+  }
+  return fetches;
+}
+void FastThreadedSSAGraphExecutor::RunOpAsync(
+    std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
+    OpHandleBase *op, BlockingQueue<size_t> *complete_q) {
+  ++remaining_;
+  this->pool_.enqueue([=] {
+    OpHandleBase *op_to_run = op;
+    size_t complete = 0;
+    while (op_to_run != nullptr) {
+      try {
+        op_to_run->Run(strategy_.use_cuda_);
+        ++complete;
+      } catch (...) {
+        exception_.Catch(std::current_exception());
+        --remaining_;
+        complete_q->Push(-1UL);
+        return;
+      }
+      auto &outputs = op_to_run->Outputs();
+      op_to_run = nullptr;
+      for (auto &output : outputs) {
+        for (auto &pending_op : output->PendingOps()) {
+          std::atomic<int> &deps = op_deps->at(pending_op);
+          if (deps.fetch_sub(1) == 1) {  // pending_op ready
+            if (op_to_run == nullptr) {
+              op_to_run = pending_op;
+            } else {
+              this->RunOpAsync(op_deps, pending_op, complete_q);
+            }
+          }
+        }
+      }
+    }
+    --remaining_;
+    complete_q->Push(complete);
+  });
+}
+void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
+  atomic_op_deps_ = pool_.enqueue([&] {
+    std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps =
+        new std::unordered_map<OpHandleBase *, std::atomic<int>>;
+    for (auto &pair : op_deps_) {
+      (*op_deps)[pair.first] = pair.second;
+    }
+    return std::unique_ptr<
+        std::unordered_map<OpHandleBase *, std::atomic<int>>>(op_deps);
+  });
+}
+
+const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..dad3a231cba6402f57ba654a9ac5fb520b9c8f04
--- /dev/null
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "ThreadPool.h"
+#include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/details/exception_holder.h"
+#include "paddle/fluid/framework/details/execution_strategy.h"
+#include "paddle/fluid/framework/details/ssa_graph_executor.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+namespace details {
+
+class OpHandleBase;
+class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
+ public:
+  FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
+                               const std::vector<Scope *> &local_scopes,
+                               const std::vector<platform::Place> &places,
+                               std::unique_ptr<ir::Graph> &&graph);
+  FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
+  const ir::Graph &Graph() const override;
+
+ private:
+  ExecutionStrategy strategy_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
+  std::unique_ptr<ir::Graph> graph_;
+
+  std::unordered_map<OpHandleBase *, int> op_deps_;
+  std::vector<OpHandleBase *> bootstrap_ops_;
+
+  ::ThreadPool pool_;
+  platform::DeviceContextPool fetch_ctxs_;
+  std::atomic<int> remaining_;
+
+  void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
+                  OpHandleBase *op, BlockingQueue<size_t> *complete_q);
+
+  void PrepareAtomicOpDeps();
+
+  std::future<
+      std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>>
+      atomic_op_deps_;
+  ExceptionHolder exception_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index ee9f9184da65467b82794c99fe3e95b108373753..3812f0abf1b7069525c4420054c61c01c908acfe 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -158,6 +158,16 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p,
 #endif
 }
 
+size_t OpHandleBase::NotReadyInputSize() const {
+  std::unordered_set<VarHandleBase *> res;
+  for (auto *var : inputs_) {
+    if (var->GeneratedOp() != nullptr) {
+      res.emplace(var);
+    }
+  }
+  return res.size();
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index 2d7f18942890245249dd0619a40bb43833c9a2ee..9fbefabc841e3f6940860f60d959fee97495e4c9 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -81,6 +81,8 @@ class OpHandleBase {
     return res.size();
   }
 
+  size_t NotReadyInputSize() const;
+
   const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }
 
   size_t NoDummyInputSize() const;
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 923a7083d4f30b646bbab03d79992b275aa2b403..da0955a9a000e0d0bff3fe9d0bc3bd25171be3d2 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -5,8 +5,12 @@ cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
 cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
 cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
 cc_library(graph_pattern_detecter SRCS graph_pattern_detecter.cc DEPS graph graph_helper graph_traits)
+cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detecter)
+cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass)
+
 
 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
 cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
 cc_test(test_graph_pattern_detecter SRCS graph_pattern_detecter_tester.cc DEPS graph_pattern_detecter)
+cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detecter graph pass graph_traits framework_proto)
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f4327742eac843f27385c165216ce48ceb97ea71
--- /dev/null
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -0,0 +1,192 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+bool VarOutLinksToOp(Node* node, const std::string& op_type) {
+  for (auto* out : node->outputs) {
+    if (out->IsOp() && out->Op()->Type() == op_type) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void BuildFCPattern(PDPattern* pattern) {
+  // make sure the selected MUL op has one input argument is a parameter.
+  auto* mul_parameter_var = pattern->NewNode(
+      [](Node* node) {
+        return node->IsVar() && node->outputs.size() == 1UL &&
+               node->outputs.front()->Op()->Type() == "mul" && node->Var() &&
+               node->Var()->Persistable();  // check is a parameter
+      },
+      "mul_weight" /*name*/);
+
+  auto* mul_tmp_input_var = pattern->NewNode(
+      [](Node* node) {
+        bool result =
+            node->IsVar() && node->outputs.size() >= 1UL && node->Var() &&
+            !node->Var()->Persistable();  // this input is not an parameter.
+        if (!result) return false;
+        // check whether one output is MUL op.
+        for (auto* op : node->outputs) {
+          if (op->IsOp() && op->Op()->Type() == "mul") return true;
+        }
+        return false;
+      },
+      "mul_tmp_var" /*name*/);
+
+  // select a MUL op
+  auto* mul_op = pattern->NewNode(
+      [](Node* node) {
+        return node->IsOp() &&               // start from an Op
+               node->Op()->Type() == "mul";  // type is mul
+        // the output should be consumed only by one element_add, that check
+        // leaves in a Var PDNode.
+      },
+      "mul" /*name*/);
+
+  // make sure the MUL op's output has only one consumer and links to an
+  // ELEMENTWISE_ADD op.
+  auto* mul_out_var = pattern->NewNode(
+      [](Node* node) {
+        return node->IsVar() &&                  // starts from a Var
+               node->outputs.size() == 1UL &&    // only has one consumer
+               node->outputs.front()->IsOp() &&  // check basic logic
+               node->Var() &&                    // not a ControlDepVar
+               node->outputs.front()->Op()->Type() ==
+                   "elementwise_add";  // a very strong validation
+      },
+      "mul_out");
+  // this check is not essential, just to make the corresponding variable Node
+  // retrival easier.
+  auto* elementwise_add_tmp_var = pattern->NewNode(
+      [](Node* node) {
+        return node->IsVar() && node->outputs.size() >= 1UL && node->Var() &&
+               VarOutLinksToOp(node, "elementwise_add");
+      },
+      "elementwise_add_tmpvar");
+
+  // select an ELEMENTWISE_ADD op
+  auto* elementwise_add_op = pattern->NewNode(
+      [](Node* node) {
+        return node->IsOp() && node->Op()->Type() == "elementwise_add";
+      },
+      "elementwise_add" /*name*/);
+
+  // get the ELEMENTWISE_ADD op's output
+  auto* elementwise_add_out_var = pattern->NewNode(
+      [](Node* node) {
+        return node->IsVar() && node->inputs.size() == 1UL && node->Var() &&
+               node->inputs.front()->Op()->Type() == "elementwise_add";
+      },
+      "elementwise_add_out");
+
+  pattern->AddEdge(mul_parameter_var, mul_op);
+  pattern->AddEdge(mul_tmp_input_var, mul_op);
+  pattern->AddEdge(mul_op, mul_out_var);
+  pattern->AddEdge(mul_out_var, elementwise_add_op);
+  pattern->AddEdge(elementwise_add_tmp_var, elementwise_add_op);
+  pattern->AddEdge(elementwise_add_op, elementwise_add_out_var);
+}
+
+// Replace the node `from` in the links to `to`
+bool LinksReplace(std::vector<Node*>* links, Node* from, Node* to) {
+  for (auto*& n : *links) {
+    if (n == from) {
+      n = to;
+      return true;
+    }
+  }
+  return false;
+}
+
+std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+
+  std::unordered_set<Node*> nodes2delete;
+
+  GraphPatternDetecter gpd;
+  BuildFCPattern(gpd.mutable_pattern());
+
+#define GET_NODE(id)                                             \
+  PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetriveNode(#id)), \
+                 "pattern has no Node called %s", #id);          \
+  auto* id = subgraph.at(gpd.pattern().RetriveNode(#id));        \
+  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
+
+  auto handler = [&](const GraphPatternDetecter::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle FC fuse";
+    // Currently, there is no FC op available, so I will just simulate the
+    // scenerio.
+    // FC's fusion is simple, just op fuse, no need to process the
+    // parameters.
+    GET_NODE(mul_tmp_var);             // x
+    GET_NODE(mul_weight);              // Y
+    GET_NODE(elementwise_add_tmpvar);  // bias
+    GET_NODE(elementwise_add_out);     // Out
+    GET_NODE(mul);                     // MUL op
+    GET_NODE(elementwise_add);         // ELEMENT_ADD op
+    GET_NODE(mul_out);                 // tmp
+#undef GET_NODE
+
+    // Create an FC Node.
+    OpDesc desc;
+    std::string fc_x_in = mul_tmp_var->Name();
+    std::string fc_Y_in = mul_weight->Name();
+    std::string fc_bias_in = elementwise_add_tmpvar->Name();
+    std::string fc_out = elementwise_add_out->Name();
+    desc.SetInput("Input", std::vector<std::string>({fc_x_in}));
+    desc.SetInput("W", std::vector<std::string>({fc_Y_in}));
+    desc.SetInput("Bias", std::vector<std::string>({fc_bias_in}));
+    desc.SetOutput("Out", std::vector<std::string>({fc_out}));
+    desc.SetType("fc");
+    auto fc_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
+    fc_node->inputs =
+        std::vector<Node*>({mul_tmp_var, mul_weight, elementwise_add_tmpvar});
+    fc_node->outputs.push_back(elementwise_add_out);
+
+    // Update link relatons
+    PADDLE_ENFORCE(LinksReplace(&mul_tmp_var->outputs, mul, fc_node));
+    PADDLE_ENFORCE(LinksReplace(&mul_weight->outputs, mul, fc_node));
+    PADDLE_ENFORCE(LinksReplace(&elementwise_add_tmpvar->outputs,
+                                elementwise_add, fc_node));
+    PADDLE_ENFORCE(
+        LinksReplace(&elementwise_add_out->inputs, elementwise_add, fc_node));
+
+    // Drop old nodes
+    graph->RemoveNode(mul);
+    graph->RemoveNode(elementwise_add);
+    graph->RemoveNode(mul_out);  // tmp variable
+  };
+
+  gpd(graph.get(), handler);
+
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass);
diff --git a/paddle/fluid/operators/fill_constant_op.cu.cc b/paddle/fluid/framework/ir/fc_fuse_pass.h
similarity index 50%
rename from paddle/fluid/operators/fill_constant_op.cu.cc
rename to paddle/fluid/framework/ir/fc_fuse_pass.h
index 51ccaefa4338dfa18d26441a59d5fed2b9fa0c39..eb43dd4486cda578804fb9f6438c67e9e4a03091 100644
--- a/paddle/fluid/operators/fill_constant_op.cu.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.h
@@ -12,15 +12,25 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/fill_constant_op.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
+#include "paddle/fluid/framework/ir/pass.h"
 
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    fill_constant,
-    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext,
-                              paddle::platform::float16>)
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Fuse the MUL and ELEMENTWISE_ADD to a FCOp.
+ */
+class FCFusePass : public Pass {
+ public:
+  virtual ~FCFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..87ba417b1a43475f48380009f8e5cd84699b8e40
--- /dev/null
+++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetInput("Xs", inputs);
+  op->SetOutput("Ys", outputs);
+}
+
+// a->OP0->b
+// a->OP1->c
+// (b, c)->mul->d
+// (d, e)->elementwise_add->f
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+  for (auto& v : std::vector<std::string>({"a", "b", "c", "d", "e", "f"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::SELECTED_ROWS);
+    if (v == "c") {
+      var->SetPersistable(true);
+    }
+  }
+
+  SetOp(&prog, "OP0", std::vector<std::string>({"a"}),
+        std::vector<std::string>({"b"}));
+  SetOp(&prog, "OP1", std::vector<std::string>({"a"}),
+        std::vector<std::string>({"c"}));
+  SetOp(&prog, "mul", std::vector<std::string>({"b", "c"}),
+        std::vector<std::string>({"d"}));
+  SetOp(&prog, "elementwise_add", std::vector<std::string>({"d", "e"}),
+        std::vector<std::string>({"f"}));
+
+  return prog;
+}
+
+TEST(FCFusePass, basic) {
+  auto prog = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  auto pass = PassRegistry::Instance().Get("fc_fuse_pass");
+
+  int pre_nodes = graph->Nodes().size();
+
+  graph = pass->Apply(std::move(graph));
+
+  int after_nodes = graph->Nodes().size();
+
+  // Remove 3 Nodes: MUL,ELEMENTWISE_ADD, mul_out
+  // Add 1 Node: FC
+  EXPECT_EQ(pre_nodes - 2, after_nodes);
+
+  // Assert fc op in newly generated graph
+  int fc_count = 0;
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp() && node->Op()->Type() == "fc") {
+      ++fc_count;
+    }
+  }
+  EXPECT_EQ(fc_count, 1);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(fc_fuse_pass);
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index f87d5212c0cd87a5a63cf2d54ca677516ab45816..2a6bf4ac230df81b38751000bf4b663f24984db3 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -117,7 +117,15 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
     }
     // For output args, always create a new var.
     for (auto &each_var_name : op->OutputArgumentNames()) {
-      ir::Node *var = CreateVarNode(all_vars.at(each_var_name));
+      ir::Node *var = nullptr;
+      if (all_vars.count(each_var_name) != 0) {
+        var = CreateVarNode(all_vars.at(each_var_name));
+      } else {
+        // Operation output vars can be @EMPTY@. For example, while_grad
+        // can have multi @EMPTY@ outputs with no VarDesc.
+        // TODO(panyx0718): Add a test.
+        var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable);
+      }
       var_nodes[each_var_name].push_back(var);
       node->outputs.push_back(var);
       var->inputs.push_back(node);
@@ -208,7 +216,8 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
       // Add write after write dependence
       ir::Node *upstream_op =
           (*it_old)->inputs.empty() ? nullptr : (*it_old)->inputs[0];
-      if (upstream_op) {
+      // TODO(zcd): Add a test.
+      if (upstream_op && upstream_op != write_op) {
         ir::Node *dep_var = CreateControlDepVar();
         write_op->inputs.push_back(dep_var);
         upstream_op->outputs.push_back(dep_var);
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 5736a5c4e232698085936303d1f23760649f8245..25e33861c06c9fcd2625e3a4036a04508acbd2ca 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -98,11 +98,13 @@ class Graph {
 
   // Create a normal variable with non-null VarDesc.
   ir::Node *CreateVarNode(VarDesc *var_desc) {
+    PADDLE_ENFORCE(var_desc);
     return AddNode(new ir::Node(var_desc));
   }
 
   // Create a normal runnable operator with OpDesc.
   ir::Node *CreateOpNode(OpDesc *op_desc) {
+    PADDLE_ENFORCE(op_desc);
     return AddNode(new ir::Node(op_desc));
   }
 
@@ -134,6 +136,14 @@ class Graph {
     return ret;
   }
 
+  void RemoveNode(ir::Node *node) {
+    PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
+    node_set_.erase(node);
+    nodes_.erase(node);
+  }
+
+  const ProgramDesc &program() const { return program_; }
+
  private:
   // This method takes ownership of `node`.
   ir::Node *AddNode(ir::Node *node) {
@@ -143,12 +153,6 @@ class Graph {
     return node;
   }
 
-  void RemoveNode(ir::Node *node) {
-    PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
-    node_set_.erase(node);
-    nodes_.erase(node);
-  }
-
   // NOTE: program_ shouldn't be exposed to user.
   const ProgramDesc &program_;
   std::map<std::string, boost::any> attrs_;
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index b1c19e6535150130822e9f48685241e62de5b064..dc81a2cac585b50b81f79f8f204ce1145d93eab0 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -104,7 +104,7 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
       for (auto &adj_n : var->inputs) {
         PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
         adj_list[n].insert(adj_n);
-        VLOG(3) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
+        VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
                 << " -> " << n->Name() << reinterpret_cast<void *>(n)
                 << "  via " << var->Name() << reinterpret_cast<void *>(var);
       }
diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter.cc b/paddle/fluid/framework/ir/graph_pattern_detecter.cc
index f27d9b0509aa4561cfd1e5da3b46a3a085cc888c..e197861251fe5c9f98eaaba2a10b4af371dcbcba 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detecter.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detecter.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <array>
 #include <string>
 #include <vector>
 
@@ -24,12 +25,30 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+size_t PDPattern::id_ = 0UL;
+
 PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) {
+  if (!name.empty()) {
+    PADDLE_ENFORCE_EQ(node_map_.count(name), 0,
+                      "PDNode's name should be unique, get duplicate [%s]",
+                      name);
+  }
+
   nodes_.emplace_back(new PDNode(std::move(teller), name));
   auto* cur = nodes_.back().get();
+  node_map_[name] = cur;
   return cur;
 }
 
+PDNode* PDPattern::RetriveNode(const std::string& id) const {
+  auto it = node_map_.find(id);
+  if (it == node_map_.end()) {
+    return nullptr;
+  }
+
+  return it->second;
+}
+
 void PDPattern::AddEdge(PDNode* a, PDNode* b) {
   PADDLE_ENFORCE(a);
   PADDLE_ENFORCE(b);
@@ -50,15 +69,18 @@ void GraphPatternDetecter::operator()(Graph* graph,
 }
 
 bool GraphPatternDetecter::MarkPDNodesInGraph(const ir::Graph& graph) {
+  VLOG(4) << "mark pdnodes in graph";
   if (graph.Nodes().empty()) return false;
 
   for (auto& node : GraphTraits::DFS(graph)) {
     for (const auto& pdnode : pattern_.nodes()) {
       if (pdnode->Tell(&node)) {
+        VLOG(4) << "pdnode " << pdnode->name() << " marked";
         pdnodes2nodes_[pdnode.get()].insert(&node);
       }
     }
   }
+  VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
   return !pdnodes2nodes_.empty();
 }
 
@@ -66,10 +88,20 @@ struct HitGroup {
   std::unordered_map<PDNode*, Node*> roles;
 
   bool Match(Node* node, PDNode* pat) {
+    if (nodes_.count(node)) {
+      if (!roles.count(pat)) return false;
+      return roles[pat] == node;
+    }
     return !roles.count(pat) || roles.at(pat) == node;
   }
 
-  void Register(Node* node, PDNode* pat) { roles[pat] = node; }
+  void Register(Node* node, PDNode* pat) {
+    roles[pat] = node;
+    nodes_.insert(node);
+  }
+
+ private:
+  std::unordered_set<Node*> nodes_;
 };
 
 // Tell whether Node a links to b.
@@ -103,6 +135,7 @@ GraphPatternDetecter::DetectPatterns() {
   // Extend a PDNode to subgraphs by deducing the connection relations defined
   // in edges of PDNodes.
   for (const auto& edge : pattern_.edges()) {
+    VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
     // Each role has two PDNodes, which indicates two roles.
     // Detect two Nodes that can match these two roles and they are connected.
     auto& pre_groups = bi_records[step % 2];
@@ -126,6 +159,7 @@ GraphPatternDetecter::DetectPatterns() {
         }
       }
     }
+    VLOG(3) << "step " << step << " get records: " << cur_groups.size();
   }
 
   for (auto& group : bi_records[step % 2]) {
diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter.h b/paddle/fluid/framework/ir/graph_pattern_detecter.h
index 1778bf00000f60e5cf8b2a585bf7e5dae0a582eb..68c39902b5a79bf25ca7f08529a958274ac64e33 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detecter.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detecter.h
@@ -96,7 +96,8 @@ class PDPattern {
 
   void AddEdge(PDNode* a, PDNode* b);
 
-  PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = "");
+  PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID());
+  PDNode* RetriveNode(const std::string& id) const;
 
   const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; }
   const std::vector<edge_t>& edges() const { return edges_; }
@@ -107,8 +108,12 @@ class PDPattern {
   FRIEND_TEST(PDPattern, NewNode);
 #endif
 
+  static std::string NewID() { return "pdnode-" + std::to_string(id_++); }
+
   std::vector<std::unique_ptr<PDNode>> nodes_;
   std::vector<edge_t> edges_;
+  std::unordered_map<std::string, PDNode*> node_map_;
+  static size_t id_;
 };
 
 /*
diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc
index b1b8d1c586c98a327a8e5b4890ced00022155e6b..cadda49c399a6d65079cacedfea61f4fd580a69a 100644
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -200,9 +200,11 @@ TEST(GraphTest, WriteAfterWrite) {
       ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1]));
       control_dep2 = n->inputs[1];
       ASSERT_EQ(n->inputs.size(), 2);
-      ASSERT_EQ(control_dep1, control_dep2);
     }
   }
+  ASSERT_NE(control_dep1, nullptr);
+  ASSERT_NE(control_dep2, nullptr);
+  ASSERT_EQ(control_dep1, control_dep2);
 }
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_traits.h b/paddle/fluid/framework/ir/graph_traits.h
index edbe45acb98326ee3bf1d86495832ec8469b634e..f42bab20ed97e372d2da0c4a492a4458ab94e0a0 100644
--- a/paddle/fluid/framework/ir/graph_traits.h
+++ b/paddle/fluid/framework/ir/graph_traits.h
@@ -12,7 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#pragma once
+
 #include <stack>
+#include <vector>
+
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc
index 8cb812d1388bf74d173a4dc7a99561e730f8e95a..e7ff0c1dac134334e3baad88886862ebff0fe367 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -25,6 +25,7 @@ static const char kGraphVizPath[] = "graph_viz_path";
 std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   const std::string graph_viz_path = Get<std::string>(kGraphVizPath);
+  VLOG(3) << "draw IR graph viz to " << graph_viz_path;
   std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path));
   PADDLE_ENFORCE(fout->good());
   std::ostream& sout = *fout;
diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f885567da1965b997b2063e06c839af95b43e1e1
--- /dev/null
+++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class InferCleanGraphPass : public Pass {
+ public:
+  virtual ~InferCleanGraphPass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const {
+    PADDLE_ENFORCE(graph.get());
+
+    auto is_valid_node = [](Node* x) {
+      return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
+    };
+
+    std::unordered_set<Node*> invalid_nodes;
+    for (auto* node : graph->Nodes()) {
+      if (is_valid_node(node)) {
+        invalid_nodes.insert(node);
+      }
+    }
+
+    // remove nodes from the graph.
+    for (auto* node : invalid_nodes) {
+      graph->RemoveNode(node);
+    }
+
+    // clean edges.
+    for (auto* node : graph->Nodes()) {
+      CleanEdges(&node->inputs, invalid_nodes);
+      CleanEdges(&node->outputs, invalid_nodes);
+    }
+
+    return graph;
+  }
+
+  void CleanEdges(std::vector<Node*>* nodes,
+                  const std::unordered_set<Node*>& to_remove) const {
+    auto it = std::remove_if(nodes->begin(), nodes->end(),
+                             [&](Node* x) { return to_remove.count(x); });
+    nodes->erase(it, nodes->end());
+  }
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(infer_clean_graph_pass,
+              paddle::framework::ir::InferCleanGraphPass);
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index 9c0765ab8ce16733ac021aefc8c7b2bb779319f3..063c70fb7b9c0f9b90d872a70f362459ef149391 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -34,14 +34,15 @@ class Node {
 
   explicit Node(VarDesc* var_desc)
       : name_(var_desc->Name()),
-        var_desc_(var_desc),
+        var_desc_(new VarDesc(*var_desc)),
         op_desc_(nullptr),
         type_(Type::kVariable) {}
 
   explicit Node(OpDesc* op_desc)
       : name_(op_desc->Type()),
         var_desc_(nullptr),
-        op_desc_(op_desc),
+        op_desc_(new OpDesc(*op_desc)),  // TODO(panyx0718) the pointer in the
+                                         // original OpDesc might go out.
         type_(Type::kOperation) {}
 
   Type NodeType() const { return type_; }
@@ -50,12 +51,12 @@ class Node {
 
   VarDesc* Var() {
     PADDLE_ENFORCE(type_ == Type::kVariable);
-    return var_desc_;
+    return var_desc_.get();
   }
 
   OpDesc* Op() {
-    PADDLE_ENFORCE(type_ == Type::kOperation);
-    return op_desc_;
+    PADDLE_ENFORCE(IsOp());
+    return op_desc_.get();
   }
 
   bool IsOp() const { return type_ == Type::kOperation; }
@@ -66,8 +67,8 @@ class Node {
 
  protected:
   const std::string name_;
-  VarDesc* var_desc_;
-  OpDesc* op_desc_;
+  std::unique_ptr<VarDesc> var_desc_;
+  std::unique_ptr<OpDesc> op_desc_;
   Type type_;
 
  private:
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 03f7e71c03b8dd75d2a47cb4c6d1ef1a71792cf3..122dc161b41246e5f08bd0ae8b763489e9ee22f9 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -202,6 +202,52 @@ std::vector<std::string> OpDesc::AttrNames() const {
 }
 
 void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
+  // NOTICE(minqiyang): pybind11 will take the empty list in python as
+  // the std::vector<int> type in C++; so we have to change the attr's type
+  // here if we meet this issue
+  proto::AttrType attr_type = static_cast<proto::AttrType>(v.which() - 1);
+  if (attr_type == proto::AttrType::INTS &&
+      boost::get<std::vector<int>>(v).size() == 0u) {
+    // Find current attr via attr name and set the correct attribute value
+    const proto::OpProto::Attr &attr = GetProtoAttr(name);
+    switch (attr.type()) {
+      case proto::AttrType::BOOLEANS: {
+        VLOG(11) << "SetAttr: " << Type() << ", " << name
+                 << " from INTS to BOOLEANS";
+        this->attrs_[name] = std::vector<bool>();
+        break;
+      }
+      case proto::AttrType::INTS: {
+        VLOG(11) << "SetAttr: " << Type() << ", " << name
+                 << " from INTS to INTS";
+        this->attrs_[name] = std::vector<int>();
+        break;
+      }
+      case proto::AttrType::FLOATS: {
+        VLOG(11) << "SetAttr: " << Type() << ", " << name
+                 << " from INTS to FLOATS";
+        this->attrs_[name] = std::vector<float>();
+        break;
+      }
+      case proto::AttrType::STRINGS: {
+        VLOG(11) << "SetAttr: " << Type() << ", " << name
+                 << " from INTS to STRINGS";
+        this->attrs_[name] = std::vector<std::string>();
+        break;
+      }
+      case proto::AttrType::BLOCKS: {
+        VLOG(11) << "SetAttr: " << Type() << ", " << name
+                 << " from INTS to BLOCKS";
+        this->SetBlocksAttr(name, std::vector<BlockDesc *>());
+        return;
+      }
+      default:
+        PADDLE_THROW("Wrong attr type %d", attr.type());
+    }
+    need_update_ = true;
+    return;
+  }
+
   this->attrs_[name] = v;
   need_update_ = true;
 }
@@ -229,6 +275,19 @@ Attribute OpDesc::GetAttr(const std::string &name) const {
   return it->second;
 }
 
+const proto::OpProto::Attr &OpDesc::GetProtoAttr(
+    const std::string &name) const {
+  const proto::OpProto &proto = OpInfoMap::Instance().Get(Type()).Proto();
+  for (int i = 0; i != proto.attrs_size(); ++i) {
+    const proto::OpProto::Attr &attr = proto.attrs(i);
+    if (attr.name() == name) {
+      return attr;
+    }
+  }
+
+  PADDLE_THROW("Attribute %s is not found in proto %s", name, proto.type());
+}
+
 Attribute OpDesc::GetNullableAttr(const std::string &name) const {
   auto it = attrs_.find(name);
   if (it != attrs_.end()) {
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index b77d84125a23b81c3de4123bea6f0e09cd6d1e90..2422392e24d864dc3e7973ab35e038ecf2c0392a 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -81,6 +81,8 @@ class OpDesc {
 
   Attribute GetAttr(const std::string &name) const;
 
+  const proto::OpProto::Attr &GetProtoAttr(const std::string &name) const;
+
   Attribute GetNullableAttr(const std::string &name) const;
 
   int GetBlockAttrId(const std::string &name) const;
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 275cb8c592c3c0b153d31149570cd6596b9e1a7f..81cb24bdda6b87a3d708cf5047dce05d5020a0d5 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
+#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
@@ -193,8 +194,14 @@ ParallelExecutor::ParallelExecutor(
       member_->local_scopes_, member_->use_cuda_, build_strategy);
 #endif
 
-  member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-      exec_strategy, member_->local_scopes_, places, std::move(graph)));
+  if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
+    member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
+        exec_strategy, member_->local_scopes_, places, std::move(graph)));
+  } else {
+    member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
+        exec_strategy, member_->local_scopes_, places, std::move(graph)));
+  }
+
   member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
       exec_strategy, member_->local_scopes_, std::move(var_infos),
       member_->places_, std::move(member_->executor_)));
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
index 20bdc7830f32564448a69e9cd76c02585b7a1aca..344c001a69b53c82967ee983783892a514c2490b 100644
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -55,11 +55,20 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
     auto all_ops = blocks_[block_id]->AllOps();
     for (size_t op_id = 0; op_id < all_ops.size(); ++op_id) {
       auto &op = all_ops[op_id];
+
       for (const std::string &attr_name : op->AttrNames()) {
         if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
           int sub_block_id =
               o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name);
           op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
+        } else if (op->GetAttrType(attr_name) == proto::AttrType::BLOCKS) {
+          std::vector<int> sub_block_ids =
+              o.Block(block_id).Op(op_id)->GetBlocksAttrIds(attr_name);
+          std::vector<BlockDesc *> block_descs;
+          for (int block_id : sub_block_ids) {
+            block_descs.push_back(MutableBlock(block_id));
+          }
+          op->SetBlocksAttr(attr_name, block_descs);
         }
       }
     }
@@ -68,24 +77,16 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
 
 ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
   desc_ = desc;
-  for (auto &block_desc : *desc_.mutable_blocks()) {
-    blocks_.emplace_back(new BlockDesc(this, &block_desc));
-  }
-  for (auto &block : blocks_) {
-    for (auto *op : block->AllOps()) {
-      for (const auto &attr : op->Proto()->attrs()) {
-        if (attr.type() == proto::AttrType::BLOCK) {
-          size_t blk_idx = attr.block_idx();
-          op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
-        }
-      }
-    }
-  }
+  InitFromProto();
 }
 
 ProgramDesc::ProgramDesc(const std::string &binary_str) {
   PADDLE_ENFORCE(desc_.ParseFromString(binary_str),
                  "Fail to parse program_desc from binary string.");
+  InitFromProto();
+}
+
+void ProgramDesc::InitFromProto() {
   for (auto &block_desc : *desc_.mutable_blocks()) {
     blocks_.emplace_back(new BlockDesc(this, &block_desc));
   }
@@ -95,6 +96,13 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
         if (attr.type() == proto::AttrType::BLOCK) {
           size_t blk_idx = attr.block_idx();
           op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
+        } else if (attr.type() == proto::AttrType::BLOCKS) {
+          auto blks_idx = attr.blocks_idx();
+          std::vector<BlockDesc *> block_descs;
+          for (int blk_idx : blks_idx) {
+            block_descs.push_back(this->MutableBlock(blk_idx));
+          }
+          op->SetBlocksAttr(attr.name(), block_descs);
         }
       }
     }
diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h
index 65fa0a0cfd5ba6d9b8765cee1309e118cb74348a..f3afc85eb924e4b03b7597e043ffd4e267adc977 100644
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
@@ -76,6 +76,8 @@ class ProgramDesc {
   void SetFetchHolderName(const std::string &fetch_holder_name);
 
  private:
+  void InitFromProto();
+
   proto::ProgramDesc desc_;
 
   std::vector<std::unique_ptr<BlockDesc>> blocks_;
diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc
index 6c46e9aad5b7fbf67fdcc07a12e7932ac8b6412b..925ea98dbe62e4da91689f6e56c135e51c24a8a3 100644
--- a/paddle/fluid/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
@@ -42,6 +42,19 @@ TEST(ProgramDesc, copy_ctor) {
   out->SetType(proto::VarType::LOD_TENSOR);
   op->SetOutput("Y", {out->Name()});
 
+  BlockDesc* new_block = program.AppendBlock(*global_block);
+  op = new_block->AppendOp();
+  op->SetType("mul");
+
+  op = global_block->AppendOp();
+  op->SetType("op_with_subblock");
+  op->SetAttr("sub_block", new_block);
+
+  std::vector<BlockDesc*> sub_blocks;
+  sub_blocks.push_back(program.AppendBlock(*global_block));
+  sub_blocks.push_back(program.AppendBlock(*global_block));
+  op->SetAttr("sub_blocks", sub_blocks);
+
   ProgramDesc program_copy(program);
 
   auto* global_block_copy = program_copy.MutableBlock(0);
@@ -64,6 +77,8 @@ TEST(ProgramDesc, copy_ctor) {
   assert_same_var("Y", y);
   assert_same_var("Out", out);
 
+  bool found_sub_block = false;
+  bool found_sub_blocks = false;
   for (size_t i = 0; i < global_block->OpSize(); ++i) {
     auto op_origin = global_block->Op(i);
     auto op_copy = global_block_copy->Op(i);
@@ -74,8 +89,17 @@ TEST(ProgramDesc, copy_ctor) {
 
     ASSERT_EQ(op_copy->Proto()->SerializeAsString(),
               op_origin->Proto()->SerializeAsString());
-  }
 
+    if (op->Type() == "op_with_subblock") {
+      ASSERT_EQ(1, op->GetBlockAttrId("sub_block"));
+      found_sub_block = true;
+
+      ASSERT_EQ(2, op->GetBlocksAttrIds("sub_blocks").size());
+      found_sub_blocks = true;
+    }
+  }
+  ASSERT_TRUE(found_sub_block);
+  ASSERT_TRUE(found_sub_blocks);
   // Not check block's protostr are same it because the order of vars could be
   // different and it is correct.
 }
diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h
new file mode 100644
index 0000000000000000000000000000000000000000..1418fb5134fdde2392da912b5f1bd9fc74e58400
--- /dev/null
+++ b/paddle/fluid/framework/rw_lock.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <pthread.h>
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+struct RWLock {
+  RWLock() { pthread_rwlock_init(&lock_, nullptr); }
+
+  ~RWLock() { pthread_rwlock_destroy(&lock_); }
+
+  void RDLock() {
+    PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
+                      "acquire read lock failed");
+  }
+
+  void WRLock() {
+    PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
+                      "acquire write lock failed");
+  }
+
+  void UNLock() {
+    PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
+  }
+
+ private:
+  pthread_rwlock_t lock_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/rw_lock_test.cc b/paddle/fluid/framework/rw_lock_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..16f9cbb65229f10912ee90436c3557aaaca169b8
--- /dev/null
+++ b/paddle/fluid/framework/rw_lock_test.cc
@@ -0,0 +1,81 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/rw_lock.h"
+#include <gtest/gtest.h>
+#include <chrono>  // NOLINT
+#include <thread>  // NOLINT
+#include <vector>
+
+namespace f = paddle::framework;
+
+void f1(f::RWLock *lock) {
+  lock->RDLock();
+  lock->UNLock();
+}
+
+TEST(RWLOCK, read_read) {
+  f::RWLock lock;
+  lock.RDLock();
+  std::thread t1(f1, &lock);
+  std::thread t2(f1, &lock);
+  t1.join();
+  t2.join();
+  lock.UNLock();
+}
+
+void f2(f::RWLock *lock, std::vector<int> *result) {
+  lock->RDLock();
+  ASSERT_EQ(result->size(), 0UL);
+  lock->UNLock();
+}
+
+void f3(f::RWLock *lock, std::vector<int> *result) {
+  lock->WRLock();
+  result->push_back(1);
+  lock->UNLock();
+}
+
+TEST(RWLOCK, read_write) {
+  f::RWLock lock;
+  std::vector<int> result;
+
+  lock.RDLock();
+  std::thread t1(f2, &lock, &result);
+  t1.join();
+  std::thread t2(f3, &lock, &result);
+  std::this_thread::sleep_for(std::chrono::seconds(1));
+  ASSERT_EQ(result.size(), 0UL);
+  lock.UNLock();
+  t2.join();
+  ASSERT_EQ(result.size(), 1UL);
+}
+
+void f4(f::RWLock *lock, std::vector<int> *result) {
+  lock->RDLock();
+  ASSERT_EQ(result->size(), 1UL);
+  lock->UNLock();
+}
+
+TEST(RWLOCK, write_read) {
+  f::RWLock lock;
+  std::vector<int> result;
+
+  lock.WRLock();
+  std::thread t1(f4, &lock, &result);
+  std::this_thread::sleep_for(std::chrono::seconds(1));
+  result.push_back(1);
+  lock.UNLock();
+  t1.join();
+}
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc
index 06ed87e7e8a2d5324b48a466b05207042ec1b7fa..c202b0a5be1f891b8ae7b11e1f6e0ce02fcba588 100644
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -120,66 +120,76 @@ bool SelectedRows::HasKey(int64_t key) const {
                                                                    : true;
 }
 
-std::vector<std::pair<int64_t, int64_t>> SelectedRows::Get(
-    const std::vector<int64_t>& keys, framework::Tensor* value) const {
+int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) {
+  rwlock_->RDLock();
+  auto iter = id_to_index_.find(key);
+  if (iter == id_to_index_.end()) {
+    rwlock_->UNLock();
+    if (!auto_grown) {
+      PADDLE_THROW("key %d not found", key);
+    }
+    rwlock_->WRLock();
+    auto map_size = id_to_index_.size();
+    auto vector_size = rows_.size();
+    if (map_size != vector_size) {
+      rwlock_->UNLock();
+      PADDLE_THROW(
+          "id_to_index_ size %d should have the same size with rows_ %d",
+          map_size, vector_size);
+    }
+    auto write_iter = id_to_index_.find(key);
+    if (write_iter == id_to_index_.end()) {
+      size_t row_num = rows_.size();
+      if (row_num == value_->dims()[0]) {
+        rwlock_->UNLock();
+        PADDLE_THROW("selected rows is full, then length exceed %d", row_num);
+      }
+      // key logic to put a key into id_to_index_
+      rows_.push_back(key);
+      auto index = static_cast<int64_t>(rows_.size() - 1);
+      id_to_index_[key] = index;
+      rwlock_->UNLock();
+      return index;
+    } else {
+      auto index = write_iter->second;
+      rwlock_->UNLock();
+      return index;
+    }
+  } else {
+    auto index = iter->second;
+    rwlock_->UNLock();
+    return index;
+  }
+}
+
+void SelectedRows::SyncIndex() {
+  rwlock_->WRLock();
+  id_to_index_.clear();
+  for (size_t i = 0; i < rows_.size(); ++i) {
+    id_to_index_[rows_[i]] = i;
+  }
+  rwlock_->UNLock();
+}
+
+void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
+                       bool auto_grown) {
   PADDLE_ENFORCE(value->IsInitialized(),
                  "The value tensor should be initialized.");
-  std::vector<std::pair<int64_t, int64_t>> non_keys_pair;
-  if (keys.empty()) {
+  if (ids.numel() == 0) {
     VLOG(3) << "keys is empty, please check data!";
   } else {
     int64_t value_width = value_->numel() / value_->dims()[0];
     PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
                       "output tensor should have the same shape with table "
                       "except the dims[0].");
-
-    for (size_t i = 0; i < keys.size(); ++i) {
-      int64_t index = Index(keys[i]);
-      if (index == -1) {
-        non_keys_pair.push_back(
-            std::make_pair(keys[i], static_cast<int64_t>(i)));
-      } else {
-        framework::VisitDataType(
-            framework::ToDataType(value_->type()),
-            TensorCopyVisitor(value, i * value_width, *value_.get(),
-                              index * value_width, value_width));
-      }
+    for (size_t i = 0; i < ids.numel(); ++i) {
+      int64_t index = AutoGrownIndex(ids.data<int64_t>()[i], auto_grown);
+      framework::VisitDataType(
+          framework::ToDataType(value_->type()),
+          TensorCopyVisitor(value, i * value_width, *value_.get(),
+                            index * value_width, value_width));
     }
   }
-  return non_keys_pair;
-}
-
-bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
-  PADDLE_ENFORCE(value.IsInitialized(), "The value should be initialized.");
-  if (value_->IsInitialized()) {
-    PADDLE_ENFORCE_EQ(
-        value.type(), value_->type(),
-        "The type of the value should be same with the original value");
-  }
-  PADDLE_ENFORCE_EQ(value.dims()[0], static_cast<size_t>(1),
-                    "The first dim of value should be 1.");
-  std::lock_guard<std::mutex> lock(*auto_grown_mutex_.get());
-  auto index = Index(key);
-  bool is_new_key = false;
-  if (index == -1) {
-    rows_.push_back(key);
-    index = rows_.size() - 1;
-    is_new_key = true;
-    // whether need to resize the table
-    if (static_cast<int64_t>(rows_.size()) > value_->dims()[0]) {
-      auto dims = value_->dims();
-      dims[0] = (dims[0] + 1) << 1;
-      framework::VisitDataType(framework::ToDataType(value.type()),
-                               ReAllocateVisitor(dims, value_.get()));
-    }
-  }
-
-  framework::VisitDataType(
-      framework::ToDataType(value.type()),
-      TensorCopyVisitor(value_.get(),
-                        index * value_->numel() / value_->dims()[0], value,
-                        static_cast<int64_t>(0), value.numel()));
-  return is_new_key;
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h
index 7160670ddd204c20021ea87cdd67ee4721d03451..daf5e95304fb84eaba26a30c45414d5021e7ffcb 100644
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -17,10 +17,12 @@ limitations under the License. */
 #include <algorithm>
 #include <memory>
 #include <mutex>  // NOLINT
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
 
@@ -48,13 +50,13 @@ class SelectedRows {
   SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
       : rows_(rows), height_(height) {
     value_.reset(new Tensor());
-    auto_grown_mutex_.reset(new std::mutex);
+    rwlock_.reset(new RWLock);
   }
 
   SelectedRows() {
     height_ = 0;
     value_.reset(new Tensor());
-    auto_grown_mutex_.reset(new std::mutex);
+    rwlock_.reset(new RWLock);
   }
 
   platform::Place place() const { return value_->place(); }
@@ -74,47 +76,51 @@ class SelectedRows {
   void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
 
   /*
-   * @brief wheter has the specified key in the table.
+   * @brief Get the index of key in rows
+   *
+   * @return -1 if the key does not exists.
+   */
+  int64_t Index(int64_t key) const {
+    auto it = std::find(rows_.begin(), rows_.end(), key);
+    if (it == rows_.end()) {
+      PADDLE_THROW("id %s not in table", key);
+    }
+    return static_cast<int64_t>(std::distance(rows_.begin(), it));
+  }
+
+  /*
+   * @brief whether has the specified key in the table.
    *
    * @return true if the key is exists.
    */
   bool HasKey(int64_t key) const;
 
   /*
-   * @brief Get value by the key list, if the
+   * @brief Get value by the key list.
+   * Note!!! this interface is only used when selected_rows is used as
+   * parameters
+   * for distribute lookup table.
    *
    * @return a list of pair which contains the non-exists key and the index in
    * the value
    */
-  std::vector<std::pair<int64_t, int64_t>> Get(const std::vector<int64_t>& keys,
-                                               framework::Tensor* value) const;
+  void Get(const framework::Tensor& ids, framework::Tensor* value,
+           bool auto_grown = false);
 
   /*
-   * @brief Set a key-value pair into the table.
-   *  This function will double the value memory if it's not engouth.
+   * @brief Get the index of the key from id_to_index_ map. If the key not
+   * exist,
+   * add the key into id_to_index_.
    *
-   * @note:
-   *    1. The first dim of the value should be 1
-   *    2. The value should be initialized and the data type
-   *       should be the same with the table.
-   *
-   * @return true if the key is a new one, otherwise false
+   * Note!!! this interface is only used when selected_rows is used as
+   * parameters
+   * for distribute lookup table.
    *
+   * @return index of the key.
    */
-  bool Set(int64_t key, const Tensor& value);
+  int64_t AutoGrownIndex(int64_t key, bool auto_grown);
 
-  /*
-   * @brief Get the index of key in rows
-   *
-   * @return -1 if the key does not exists.
-   */
-  int64_t Index(int64_t key) const {
-    auto it = std::find(rows_.begin(), rows_.end(), key);
-    if (it == rows_.end()) {
-      return static_cast<int64_t>(-1);
-    }
-    return static_cast<int64_t>(std::distance(rows_.begin(), it));
-  }
+  void SyncIndex();
 
   DDim GetCompleteDims() const {
     std::vector<int64_t> dims = vectorize(value_->dims());
@@ -127,9 +133,10 @@ class SelectedRows {
   // SelectedRows are simply concated when adding together. Until a
   // SelectedRows add a Tensor, will the duplicate rows be handled.
   Vector<int64_t> rows_;
+  std::unordered_map<int64_t, int64_t> id_to_index_;
   std::unique_ptr<Tensor> value_{nullptr};
   int64_t height_;
-  std::unique_ptr<std::mutex> auto_grown_mutex_{nullptr};
+  std::unique_ptr<RWLock> rwlock_{nullptr};
 };
 
 /*
diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc
index eefcaa5672c5a3debf162f5c8eda653408dcf221..5ca864cfdf7176850dd31dd42ef3306061a742cf 100644
--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
@@ -9,8 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/selected_rows.h"
+#include <time.h>
+#include <thread>  // NOLINT
+
 #include "gtest/gtest.h"
+#include "paddle/fluid/framework/selected_rows.h"
 
 namespace paddle {
 namespace framework {
@@ -59,39 +62,129 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
   ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
 }
 
-TEST_F(SelectedRowsTester, SparseTable) {
+TEST(SelectedRows, SparseTable) {
   platform::CPUPlace cpu;
   SelectedRows table;
+
+  int64_t table_size = 100;
+  int64_t embedding_width = 8;
   // initialize a sparse table
-  table.mutable_value()->Resize(framework::make_ddim({1, 100}));
-  table.mutable_value()->mutable_data<float>(cpu);
-  table.mutable_rows()->push_back(1);
+  table.mutable_value()->Resize(
+      framework::make_ddim({table_size, embedding_width}));
+  auto* data = table.mutable_value()->mutable_data<float>(cpu);
+  for (int64_t i = 0; i < table_size; ++i) {
+    for (int64_t j = 0; j < embedding_width; ++j) {
+      data[i * embedding_width + j] = static_cast<float>(i);
+    }
+  }
+  ASSERT_EQ(table.AutoGrownIndex(10, true), 0);
+  ASSERT_EQ(table.AutoGrownIndex(8, true), 1);
+  ASSERT_EQ(table.AutoGrownIndex(8, true), 1);
+  ASSERT_EQ(table.AutoGrownIndex(6, true), 2);
+  ASSERT_TRUE(table.HasKey(10));
+  ASSERT_TRUE(table.HasKey(8));
+  ASSERT_TRUE(table.HasKey(6));
+  ASSERT_EQ(table.rows().size(), 3);
+
+  framework::Tensor ids;
+  ids.Resize(framework::make_ddim({4}));
+  auto* ids_data = ids.mutable_data<int64_t>(cpu);
+  ids_data[0] = static_cast<int64_t>(6);
+  ids_data[1] = static_cast<int64_t>(6);
+  ids_data[2] = static_cast<int64_t>(8);
+  ids_data[3] = static_cast<int64_t>(10);
 
-  int64_t key = 10000;
-  int64_t non_key = 999;
-  framework::Tensor value;
-  value.Resize(framework::make_ddim({1, 100}));
-  auto ptr = value.mutable_data<float>(cpu);
-  ptr[0] = static_cast<float>(10);
+  framework::Tensor get_value;
+  auto* value_data = get_value.mutable_data<float>(
+      framework::make_ddim({4, embedding_width}), cpu);
+  table.Get(ids, &get_value);
 
-  ASSERT_EQ(table.rows().size(), static_cast<size_t>(1));
-  ASSERT_EQ(table.HasKey(key), false);
+  for (int j = 0; j < embedding_width; ++j) {
+    ASSERT_EQ(value_data[0 * embedding_width + j], 2);
+  }
+  for (int j = 0; j < embedding_width; ++j) {
+    ASSERT_EQ(value_data[1 * embedding_width + j], 2);
+  }
+  for (int j = 0; j < embedding_width; ++j) {
+    ASSERT_EQ(value_data[2 * embedding_width + j], 1);
+  }
+  for (int j = 0; j < embedding_width; ++j) {
+    ASSERT_EQ(value_data[3 * embedding_width + j], 0);
+  }
+}
 
-  table.Set(key, value);
+void f1(SelectedRows* table, int table_size) {
+  for (int i = 1000000; i > 0; --i) {
+    auto id = i % table_size;
+    int64_t index1 = table->AutoGrownIndex(id, true);
+    int64_t index2 = table->AutoGrownIndex(id, false);
+    int64_t index3 = table->AutoGrownIndex(id, true);
+    ASSERT_EQ(index1, index2);
+    ASSERT_EQ(index2, index3);
+  }
+}
 
-  ASSERT_EQ(table.rows().size(), static_cast<size_t>(2));
-  ASSERT_EQ(table.HasKey(key), true);
-  // check re-allocate
-  ASSERT_EQ(table.value().dims()[0], static_cast<int64_t>(4));
+void f2(SelectedRows* table, int table_size) {
+  for (int i = 0; i < 1000000; ++i) {
+    auto id = i % table_size;
+    int64_t index1 = table->AutoGrownIndex(id, true);
+    int64_t index2 = table->AutoGrownIndex(id, false);
+    int64_t index3 = table->AutoGrownIndex(id, true);
+    ASSERT_EQ(index1, index2);
+    ASSERT_EQ(index2, index3);
+  }
+}
 
-  framework::Tensor get_value;
-  get_value.mutable_data<float>(framework::make_ddim({2, 100}), cpu);
-  std::vector<int64_t> keys({non_key, key});
-  auto non_key_pairs = table.Get(keys, &get_value);
+void f3(SelectedRows* table, int table_size) {
+  clock_t t1 = clock();
+  for (int i = 100000; i > 0; --i) {
+    auto id1 = table->AutoGrownIndex(i % table_size, true);
+    auto id2 = table->Index(i % table_size);
+    ASSERT_EQ(id1, id2);
+  }
+  clock_t t2 = clock();
+  std::cout << "f3 run time:" << t2 - t1 << std::endl;
+}
+
+void f4(SelectedRows* table, int table_size) {
+  clock_t t1 = clock();
+  for (int i = 0; i < 100000; ++i) {
+    auto id1 = table->AutoGrownIndex(i % table_size, true);
+    auto id2 = table->Index(i % table_size);
+    ASSERT_EQ(id1, id2);
+  }
+  clock_t t2 = clock();
+  std::cout << "f4 run time:" << t2 - t1 << std::endl;
+}
+
+TEST(SelectedRows, MultiThreadAutoIndex) {
+  platform::CPUPlace cpu;
+  SelectedRows table;
+
+  int64_t table_size = 100000;
+  int64_t embedding_width = 8;
+  // initialize a sparse table
+  table.mutable_value()->Resize(
+      framework::make_ddim({table_size, embedding_width}));
+  auto* data = table.mutable_value()->mutable_data<float>(cpu);
+  for (int64_t i = 0; i < table_size; ++i) {
+    for (int64_t j = 0; j < embedding_width; ++j) {
+      data[i * embedding_width + j] = static_cast<float>(i);
+    }
+  }
 
-  ASSERT_EQ(get_value.data<float>()[100], static_cast<float>(10));
-  ASSERT_EQ(non_key_pairs.size(), static_cast<size_t>(1));
-  ASSERT_EQ(non_key_pairs[0].first, non_key);
+  std::thread t1(f1, &table, table_size);
+  std::thread t11(f1, &table, table_size);
+  std::thread t2(f2, &table, table_size);
+  std::thread t22(f2, &table, table_size);
+  t1.join();
+  t11.join();
+  t2.join();
+  t22.join();
+  std::thread t3(f3, &table, table_size);
+  std::thread t4(f4, &table, table_size);
+  t3.join();
+  t4.join();
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 27fe575cb6167a726ff92a8f3d2e47b6f536ba39..4feaed2b0d9cdec735bd3fadc98aa2bad715c209 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -1,13 +1,17 @@
+cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass)
 cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
+  analyzer.cc
+  helper.cc
+  # passes
   fluid_to_data_flow_graph_pass.cc
   data_flow_graph_to_fluid_pass.cc
   dfg_graphviz_draw_pass.cc
   tensorrt_subgraph_pass.cc
   tensorrt_subgraph_node_mark_pass.cc
-  analyzer.cc
-  helper.cc
-        model_store_pass.cc
-  DEPS framework_proto proto_desc)
+  fluid_to_ir_pass.cc
+  model_store_pass.cc
+  DEPS framework_proto proto_desc ir_pass_manager graph pass)
+
 cc_test(test_node SRCS node_tester.cc DEPS analysis)
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
 cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis)
@@ -18,7 +22,7 @@ function (inference_analysis_test TARGET)
     if(WITH_TESTING)
         set(options "")
         set(oneValueArgs "")
-        set(multiValueArgs SRCS)
+        set(multiValueArgs SRCS EXTRA_DEPS)
         cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
         set(mem_opt "")
@@ -27,19 +31,51 @@ function (inference_analysis_test TARGET)
         endif()
         cc_test(${TARGET}
                 SRCS "${analysis_test_SRCS}"
-                DEPS analysis
+                DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detecter pass ${analysis_test_EXTRA_DEPS}
                 ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt})
         set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
     endif(WITH_TESTING)
 endfunction(inference_analysis_test)
 
+set(DITU_RNN_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fmodel.tar.gz")
+set(DITU_RNN_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fdata.txt.tar.gz")
+set(DITU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/ditu_rnn" CACHE PATH "Ditu RNN model and data root." FORCE)
+set(DITU_RNN_MODEL ${DITU_INSTALL_DIR}/model)
+set(DITU_RNN_DATA ${DITU_INSTALL_DIR}/data.txt)
+
+function (inference_download_and_uncompress target url gz_filename)
+    message(STATUS "Download inference test stuff ${gz_filename} from ${url}")
+    execute_process(COMMAND bash -c "mkdir -p ${DITU_INSTALL_DIR}")
+    execute_process(COMMAND bash -c "cd ${DITU_INSTALL_DIR} && wget -q ${url}")
+    execute_process(COMMAND bash -c "cd ${DITU_INSTALL_DIR} && tar xzf ${gz_filename}")
+    message(STATUS "finish downloading ${gz_filename}")
+endfunction(inference_download_and_uncompress)
+
+if (NOT EXISTS ${DITU_INSTALL_DIR})
+    inference_download_and_uncompress(ditu_rnn_model ${DITU_RNN_MODEL_URL} "ditu_rnn_fluid%2Fmodel.tar.gz")
+    inference_download_and_uncompress(ditu_rnn_data ${DITU_RNN_DATA_URL} "ditu_rnn_fluid%2Fdata.txt.tar.gz")
+endif()
+
+inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis
+		# ir
+		fc_fuse_pass
+		graph_viz_pass
+		infer_clean_graph_pass
+		graph_pattern_detecter
+        infer_clean_graph_pass
+		pass
+    ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model
+        --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
+        --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)
+
 inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
 inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
+inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc)
 inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
 inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
 inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
 inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc)
 inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
 inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
-inference_analysis_test(test_analyzer SRCS analyzer_tester.cc)
 inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index 9318f1089781b30468cf4d3c7151d0dd26e50a9c..7d16364609463e9c48720e772cebee7731dfd452 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -17,22 +17,23 @@
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h"
 #include "paddle/fluid/inference/analysis/model_store_pass.h"
 #include "paddle/fluid/inference/analysis/pass_manager.h"
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
 
-namespace paddle {
-
-DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, true,
+DEFINE_bool(IA_enable_tensorrt_subgraph_engine, false,
             "Enable subgraph to TensorRT engine for acceleration");
 
-DEFINE_string(inference_analysis_graphviz_log_root, "./",
+DEFINE_bool(IA_enable_ir, false, "Turn on IR support");
+
+DEFINE_string(IA_graphviz_log_root, "./",
               "Graphviz debuger for data flow graphs.");
 
-DEFINE_string(inference_analysis_output_storage_path, "",
-              "optimized model output path");
+DEFINE_string(IA_output_storage_path, "", "optimized model output path");
 
+namespace paddle {
 namespace inference {
 namespace analysis {
 
@@ -40,11 +41,38 @@ class DfgPassManagerImpl final : public DfgPassManager {
  public:
   DfgPassManagerImpl() {
     // TODO(Superjomn) set the key with pass reprs.
-    AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
-    if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
+    LOG(INFO)
+        << "-----------------------------------------------------------------";
+    if (FLAGS_IA_enable_ir) {
+      AddPass("fluid-to-ir-pass", new FluidToIrPass);
+    } else {
+      AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
+    }
+    TryAddTensorRtPass();
+    AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
+    if (!FLAGS_IA_output_storage_path.empty()) {
+      AddPass("model-store-pass", new ModelStorePass);
+    }
+    LOG(INFO)
+        << "-----------------------------------------------------------------";
+  }
+
+  std::string repr() const override { return "dfg-pass-manager"; }
+  std::string description() const override { return "DFG pass manager."; }
+
+ private:
+  void AddPass(const std::string& name, Pass* pass) {
+    VLOG(3) << "Adding pass " << name;
+    Register(name, pass);
+    AddGraphvizDebugerPass(pass);
+  }
+
+  void TryAddTensorRtPass() {
+    if (FLAGS_IA_enable_tensorrt_subgraph_engine) {
       auto trt_teller = [&](const Node* node) {
         std::unordered_set<std::string> teller_set(
-            {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax"});
+            {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax",
+             "depthwise_conv2d", "batch_norm"});
         if (!node->IsFunction()) return false;
 
         const auto* func = static_cast<const Function*>(node);
@@ -59,20 +87,6 @@ class DfgPassManagerImpl final : public DfgPassManager {
               new TensorRTSubgraphNodeMarkPass(trt_teller));
       AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
     }
-    AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
-    if (!FLAGS_inference_analysis_output_storage_path.empty()) {
-      AddPass("model-store-pass", new ModelStorePass);
-    }
-  }
-
-  std::string repr() const override { return "dfg-pass-manager"; }
-  std::string description() const override { return "DFG pass manager."; }
-
- private:
-  void AddPass(const std::string& name, Pass* pass) {
-    LOG(INFO) << "Adding pass " << name;
-    Register(name, pass);
-    AddGraphvizDebugerPass(pass);
   }
 
   // Add the graphviz debuger pass if the parent pass has one.
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index c82fdfff86c91b4e07e3c1b80987d3d8d796ad23..2e107c82dd50d5cf22797f4c82e69d302514f955 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -39,14 +39,14 @@ limitations under the License. */
 #include "paddle/fluid/inference/analysis/pass.h"
 #include "paddle/fluid/inference/analysis/pass_manager.h"
 
-namespace paddle {
-
 // TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
 // flag if not available.
-DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
-DECLARE_string(inference_analysis_graphviz_log_root);
-DECLARE_string(inference_analysis_output_storage_path);
+DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
+DECLARE_string(IA_graphviz_log_root);
+DECLARE_string(IA_output_storage_path);
+DECLARE_bool(IA_enable_ir);
 
+namespace paddle {
 namespace inference {
 namespace analysis {
 
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 24bfb3993cf569561980006b6627b56327dd0f67..52f5c4f5aea387c947ee909b79dae8a1bfb89d82 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -13,15 +13,23 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/analyzer.h"
+
 #include <google/protobuf/text_format.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
+DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
 TEST(Analyzer, analysis_without_tensorrt) {
-  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false;
+  FLAGS_IA_enable_tensorrt_subgraph_engine = false;
   Argument argument;
   argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   Analyzer analyser;
@@ -29,13 +37,327 @@ TEST(Analyzer, analysis_without_tensorrt) {
 }
 
 TEST(Analyzer, analysis_with_tensorrt) {
-  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true;
+  FLAGS_IA_enable_tensorrt_subgraph_engine = true;
   Argument argument;
   argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   Analyzer analyser;
   analyser.Run(&argument);
 }
 
+void TestWord2vecPrediction(const std::string &model_path) {
+  NativeConfig config;
+  config.model_dir = model_path;
+  config.use_gpu = false;
+  config.device = 0;
+  auto predictor =
+      ::paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
+          config);
+
+  // One single batch
+
+  int64_t data[4] = {1, 2, 3, 4};
+  PaddleTensor tensor;
+  tensor.shape = std::vector<int>({4, 1});
+  tensor.data = PaddleBuf(data, sizeof(data));
+  tensor.dtype = PaddleDType::INT64;
+
+  // For simplicity, we set all the slots with the same data.
+  std::vector<PaddleTensor> slots(4, tensor);
+  std::vector<PaddleTensor> outputs;
+  CHECK(predictor->Run(slots, &outputs));
+
+  PADDLE_ENFORCE(outputs.size(), 1UL);
+  // Check the output buffer size and result of each tid.
+  PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
+  float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
+                     0.000932706};
+  const size_t num_elements = outputs.front().data.length() / sizeof(float);
+  // The outputs' buffers are in CPU memory.
+  for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+    LOG(INFO) << "data: "
+              << static_cast<float *>(outputs.front().data.data())[i];
+    PADDLE_ENFORCE(static_cast<float *>(outputs.front().data.data())[i],
+                   result[i]);
+  }
+}
+
+namespace {
+
+struct DataRecord {
+  std::vector<std::vector<std::vector<float>>> link_step_data_all;
+  std::vector<std::vector<float>> week_data_all, minute_data_all;
+  std::vector<size_t> lod1, lod2, lod3;
+  std::vector<std::vector<float>> rnn_link_data, rnn_week_datas,
+      rnn_minute_datas;
+  size_t batch_iter{0};
+  size_t batch_size{1};
+  DataRecord() = default;
+  DataRecord(const std::string &path, int batch_size = 1)
+      : batch_size(batch_size) {
+    Load(path);
+  }
+  DataRecord NextBatch() {
+    DataRecord data;
+    size_t batch_end = batch_iter + batch_size;
+    // NOTE skip the final batch, if no enough data is provided.
+    if (batch_end <= link_step_data_all.size()) {
+      data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter,
+                                     link_step_data_all.begin() + batch_end);
+      data.week_data_all.assign(week_data_all.begin() + batch_iter,
+                                week_data_all.begin() + batch_end);
+      data.minute_data_all.assign(minute_data_all.begin() + batch_iter,
+                                  minute_data_all.begin() + batch_end);
+      // Prepare LoDs
+      data.lod1.push_back(0);
+      data.lod2.push_back(0);
+      data.lod3.push_back(0);
+      CHECK(!data.link_step_data_all.empty()) << "empty";
+      CHECK(!data.week_data_all.empty());
+      CHECK(!data.minute_data_all.empty());
+      CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size());
+      CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size());
+      for (size_t j = 0; j < data.link_step_data_all.size(); j++) {
+        for (const auto &d : data.link_step_data_all[j]) {
+          data.rnn_link_data.push_back(d);
+        }
+        data.rnn_week_datas.push_back(data.week_data_all[j]);
+        data.rnn_minute_datas.push_back(data.minute_data_all[j]);
+        // calculate lod
+        data.lod1.push_back(data.lod1.back() +
+                            data.link_step_data_all[j].size());
+        data.lod3.push_back(data.lod3.back() + 1);
+        for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) {
+          data.lod2.push_back(data.lod2.back() +
+                              data.link_step_data_all[j].size());
+        }
+      }
+    }
+    batch_iter += batch_size;
+    return data;
+  }
+  void Load(const std::string &path) {
+    std::ifstream file(path);
+    std::string line;
+    int num_lines = 0;
+    while (std::getline(file, line)) {
+      num_lines++;
+      std::vector<std::string> data;
+      split(line, ':', &data);
+      std::vector<std::vector<float>> link_step_data;
+      std::vector<std::string> link_datas;
+      split(data[0], '|', &link_datas);
+      for (auto &step_data : link_datas) {
+        std::vector<float> tmp;
+        split_to_float(step_data, ',', &tmp);
+        link_step_data.push_back(tmp);
+      }
+      // load week data
+      std::vector<float> week_data;
+      split_to_float(data[2], ',', &week_data);
+      // load minute data
+      std::vector<float> minute_data;
+      split_to_float(data[1], ',', &minute_data);
+      link_step_data_all.push_back(std::move(link_step_data));
+      week_data_all.push_back(std::move(week_data));
+      minute_data_all.push_back(std::move(minute_data));
+    }
+  }
+};
+void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
+                   int batch_size) {
+  // DataRecord data(FLAGS_datapath, batch_size);
+  PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
+      week_tensor, minute_tensor;
+  lod_attention_tensor.name = "data_lod_attention";
+  init_zero_tensor.name = "cell_init";
+  lod_tensor_tensor.name = "data";
+  week_tensor.name = "week";
+  minute_tensor.name = "minute";
+  auto one_batch = data->NextBatch();
+  // clang-format off
+  std::vector<int> rnn_link_data_shape
+      ({static_cast<int>(one_batch.rnn_link_data.size()), static_cast<int>(one_batch.rnn_link_data.front().size())});
+  lod_attention_tensor.shape.assign({1, 2});
+  lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2});
+  init_zero_tensor.shape.assign({batch_size, 15});
+  init_zero_tensor.lod.assign({one_batch.lod3});
+  lod_tensor_tensor.shape = rnn_link_data_shape;
+  lod_tensor_tensor.lod.assign({one_batch.lod1});
+  week_tensor.shape.assign({(int) one_batch.rnn_week_datas.size(), (int) one_batch.rnn_week_datas.front().size()});
+  week_tensor.lod.assign({one_batch.lod3});
+  minute_tensor.shape.assign({(int) one_batch.rnn_minute_datas.size(),
+                              (int) one_batch.rnn_minute_datas.front().size()});
+  minute_tensor.lod.assign({one_batch.lod3});
+  // assign data
+  TensorAssignData(&lod_attention_tensor, std::vector<std::vector<float>>({{0, 0}}));
+  std::vector<float> tmp_zeros(batch_size * 15, 0.);
+  TensorAssignData(&init_zero_tensor, {tmp_zeros});
+  TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data);
+  TensorAssignData(&week_tensor, one_batch.rnn_week_datas);
+  TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas);
+  // clang-format on
+  // Set inputs.
+  auto init_zero_tensor1 = init_zero_tensor;
+  init_zero_tensor1.name = "hidden_init";
+  input_slots->assign({week_tensor, init_zero_tensor, minute_tensor,
+                       init_zero_tensor1, lod_attention_tensor,
+                       lod_tensor_tensor});
+  for (auto &tensor : *input_slots) {
+    tensor.dtype = PaddleDType::FLOAT32;
+  }
+}
+
+std::string DescribeTensor(const PaddleTensor &tensor) {
+  std::stringstream os;
+  os << "Tensor [" << tensor.name << "]\n";
+  os << " - type: ";
+  switch (tensor.dtype) {
+    case PaddleDType::FLOAT32:
+      os << "float32";
+      break;
+    case PaddleDType::INT64:
+      os << "int64";
+      break;
+    default:
+      os << "unset";
+  }
+  os << '\n';
+
+  os << " - shape: " << to_string(tensor.shape) << '\n';
+  os << " - lod: ";
+  for (auto &l : tensor.lod) {
+    os << to_string(l) << "; ";
+  }
+  os << "\n";
+  os << " - data: ";
+
+  // clang-format off
+  int dim = std::accumulate(tensor.shape.begin(),
+                            tensor.shape.end(),
+                            1,
+                            [](int a, int b) { return a * b; });  // clang-format on
+  for (size_t i = 0; i < dim; i++) {
+    os << static_cast<float *>(tensor.data.data())[i] << " ";
+  }
+  os << '\n';
+  return os.str();
+}
+
+}  // namespace
+
+const float ditu_rnn_target_data[] = {
+    104.711, 11.2431, 1.35422, 0,       0,       0,       0,       0,
+    27.7039, 1.41486, 7.09526, 0,       0,       0,       0,       0,
+    7.6481,  6.5324,  56.383,  2.88018, 8.92918, 132.007, 4.27429, 2.02934,
+    14.1727, 10.7461, 25.0616, 16.0197, 14.4163, 16.9199, 6.75517, 0,
+    80.0249, 4.77739, 0,       0,       0,       0,       0,       0,
+    47.5643, 2.67029, 8.76252, 0,       0,       0,       0,       0,
+    51.8822, 4.4411,  0,       0,       0,       0,       0,       0,
+    10.7286, 12.0595, 10.6672, 0,       0,       0,       0,       0,
+    93.5771, 3.84641, 0,       0,       0,       0,       0,       0,
+    169.426, 0,       0,       0,       0,       0,       0,       0};
+// Test with a really complicate model.
+void TestDituRNNPrediction(const std::string &model_path,
+                           const std::string &data_path, int batch_size,
+                           bool use_analysis, bool activate_ir,
+                           int num_times = 1) {
+  FLAGS_IA_enable_ir = activate_ir;
+  FLAGS_IA_enable_tensorrt_subgraph_engine = false;
+  FLAGS_IA_output_storage_path = "./analysis.out";
+
+  std::string model_out;
+  if (use_analysis) {
+    Argument argument(model_path);
+    argument.model_output_store_path.reset(new std::string("./analysis.out"));
+
+    Analyzer analyzer;
+    analyzer.Run(&argument);
+
+    // Should get the transformed model stored to ./analysis.out
+    model_out = "./analysis.out";
+    ASSERT_TRUE(PathExists(model_out));
+  } else {
+    model_out = FLAGS_infer_ditu_rnn_model;
+  }
+
+  NativeConfig config;
+  config.prog_file = model_out + "/__model__";
+  config.param_file = model_out + "/param";
+  config.use_gpu = false;
+  config.device = 0;
+  config.specify_input_name = true;
+
+  auto predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  std::vector<PaddleTensor> input_slots;
+  DataRecord data(data_path, batch_size);
+  // Prepare inputs.
+  PrepareInputs(&input_slots, &data, batch_size);
+  std::vector<PaddleTensor> outputs;
+
+  Timer timer;
+  timer.tic();
+  for (int i = 0; i < num_times; i++) {
+    predictor->Run(input_slots, &outputs);
+  }
+  LOG(INFO) << "time/batch: " << timer.toc() / num_times;
+
+  for (auto &out : outputs) {
+    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+    float *data = static_cast<float *>(out.data.data());
+    for (int i = 0;
+         i < std::min(sizeof(ditu_rnn_target_data) / sizeof(float), size);
+         i++) {
+      EXPECT_NEAR(data[i], ditu_rnn_target_data[i], 1e-3);
+    }
+  }
+}
+
+// Turn on the IR pass supportion, run a real inference and check the result.
+TEST(Analyzer, SupportIRPass) {
+  FLAGS_IA_enable_ir = true;
+  FLAGS_IA_enable_tensorrt_subgraph_engine = false;
+  FLAGS_IA_output_storage_path = "./analysis.out";
+
+  Argument argument(FLAGS_inference_model_dir);
+  argument.model_output_store_path.reset(new std::string("./analysis.out"));
+
+  Analyzer analyzer;
+  analyzer.Run(&argument);
+
+  // Should get the transformed model stored to ./analysis.out
+  ASSERT_TRUE(PathExists("./analysis.out"));
+
+  // Inference from this path.
+  TestWord2vecPrediction("./analysis.out");
+}
+
+// Directly infer with the original model.
+TEST(Analyzer, DituRNN_without_analysis) {
+  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
+                        10, false, false);
+}
+
+// Inference with the original model with the analysis turned on, the analysis
+// module will transform the program to a data flow graph.
+TEST(Analyzer, DituRNN_with_analysis) {
+  LOG(INFO) << "ditu rnn with analysis";
+  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
+                        10, true, false, 1);
+}
+
+// Inference with analysis and IR. The IR module will fuse some large kernels.
+TEST(Analyzer, DituRNN_with_analysis_with_IR) {
+  LOG(INFO) << "ditu rnn with analysis and IR fuse";
+  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
+                        10, true, true, 1);
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
+
+USE_PASS(fc_fuse_pass);
+USE_PASS(graph_viz_pass);
+USE_PASS(infer_clean_graph_pass);
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc
index 7f64bc75ae8ad40a268739cdc36051e76af9f49a..100a7504b8526b3587858dd7783913757ba09895 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -19,14 +19,16 @@ limitations under the License. */
 namespace paddle {
 namespace inference {
 namespace analysis {
+using ir_node_t = framework::ir::Node;
+using ir_graph_t = framework::ir::Graph;
 
 // It is a better idea that the inputs and outputs of this graph is set manually
 // before, but there must be a Pass that helps to prune the unnecessary ops that
 // do not contribute to the given targets, so in this pass, analysis and get the
 // inputs and outputs is OK.
 void DataFlowGraph::Build() {
-  inputs.clear();
-  outputs.clear();
+  inputs_.clear();
+  outputs_.clear();
   std::unordered_set<Node *> ins;
   std::unordered_set<Node *> outs;
   for (auto &node : nodes.nodes()) {
@@ -42,18 +44,140 @@ void DataFlowGraph::Build() {
   // similarly, the nodes that in outs but not in ins is the graphs' outputs
   for (auto *in : ins) {
     if (!outs.count(in)) {
-      inputs.push_back(in);
+      inputs_.push_back(in);
     }
   }
   for (auto *out : outs) {
-    if (!outs.count(out)) {
-      outputs.push_back(out);
+    if (!ins.count(out)) {
+      outputs_.push_back(out);
     }
   }
 
   Clean();
 }
 
+void DataFlowGraph::Build(const framework::proto::ProgramDesc &prog) {
+  // insert vars
+  // The `var2id` keeps a map from a variable's name to its Node-id, the Node-id
+  // will keep updating to its latest alias during the graph-building.
+  std::unordered_map<std::string, size_t> var2id;
+  auto &main_block = prog.blocks(framework::kRootBlockIndex);
+  for (int i = 0; i < main_block.vars_size(); i++) {
+    const auto &var = main_block.vars(i);
+    auto *v = nodes.Create(Node::Type::kValue);
+    v->SetName(var.name());
+    v->SetPbDesc(const_cast<void *>(static_cast<const void *>(&var)));
+    v->SetPbMsg(var.SerializeAsString());
+    var2id[var.name()] = v->id();
+  }
+
+  // The variables in a SSA can only write once, so if a variable is written
+  // multiple times(quite common in our ProgramDesc design), multiple alias
+  // Nodes of this variable will be created, and each will just write once.
+
+  // An set that keep all the names of the variables(the original, not alias)
+  // that have been written(as outputs). Once an Op's output variable hit the
+  // set, it should create a new alias and update the global alias for this
+  // variable. And that make a Data Flow Graph a SSA.
+  std::unordered_set<Node *> unique_written_vars;
+  for (int i = 0; i < main_block.ops_size(); i++) {
+    const auto &op = main_block.ops(i);
+    auto *o = nodes.Create(Node::Type::kFunction);
+    o->SetName(op.type());
+    static_cast<Function *>(o)->SetFuncType(op.type());
+    // Link to the original protobuf message's memory, make it easier to
+    // generate from a data flow graph to fluid ProgramDesc.
+    o->SetPbDesc(const_cast<void *>(static_cast<const void *>(&op)));
+    o->SetPbMsg(op.SerializeAsString());
+
+    // set inputs and outputs
+    for (int j = 0; j < op.inputs_size(); j++) {
+      auto &in_var = op.inputs(j);
+      for (int k = 0; k < in_var.arguments_size(); k++) {
+        auto *in = nodes.GetMutable(var2id.at(in_var.arguments(k)));
+        in->outlinks.push_back(o);
+        o->inlinks.push_back(in);
+        unique_written_vars.insert(in);
+      }
+    }
+    for (int j = 0; j < op.outputs_size(); j++) {
+      auto &out_var = op.outputs(j);
+      for (int k = 0; k < out_var.arguments_size(); k++) {
+        auto *out = nodes.GetMutable(var2id[out_var.arguments(k)]);
+        if (unique_written_vars.count(out)) {
+          // Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
+          auto *out_alias = nodes.Create(Node::Type::kValue);
+          out_alias->SetName(out->name());
+          out_alias->SetPbDesc(out->pb_desc());
+          out_alias->SetPbMsg(out->pb_msg());
+          var2id[out_alias->name()] =
+              out_alias->id();  // update variable's alias Node
+          LOG(INFO) << "loop found in graph, create SSA alias node ["
+                    << out_alias->repr() << "] for [" << out->repr() << "]";
+          out = out_alias;
+        }
+        out->inlinks.push_back(o);
+        o->outlinks.push_back(out);
+      }
+    }
+  }
+  // Analysis and extract the inputs and outputs of this graph.
+  Build();
+}
+
+void DataFlowGraph::Build(const framework::ir::Graph &graph) {
+  // Create nodes
+  std::unordered_map<ir_node_t *, Node *> ir_node_map;
+  for (auto *ir_node : graph.Nodes()) {
+    Node *x{nullptr};
+    if (ir_node->IsOp()) {
+      PADDLE_ENFORCE(ir_node->Op());
+      VLOG(4) << "get op " << ir_node << " " << ir_node->Name();
+      x = nodes.Create(Node::Type::kFunction);
+      x->attr("ir_node").Pointer() = ir_node;
+      PADDLE_ENFORCE(ir_node->Op()->Proto());
+      x->SetName(ir_node->Op()->Proto()->type());
+      x->SetPbMsg(ir_node->Op()->Proto()->SerializeAsString());
+    } else if (ir_node->IsVar()) {
+      // Not create a Node for IR ControlDepVar, considering Inference currently
+      // just used in single thread scenerio.
+      VLOG(4) << "get var " << ir_node->Name();
+      x = nodes.Create(Node::Type::kValue);
+      x->attr("ir_node").Pointer() = ir_node;
+      x->SetName(ir_node->Name());
+      // x->SetPbMsg(ir_node->Var()->Proto()->SerializeAsString());
+    } else {
+      PADDLE_THROW("Failed to create an Node from IR, unknown type");
+    }
+    ir_node_map.emplace(ir_node, x);
+  }
+  VLOG(4) << "finish creating Nodes";
+
+  VLOG(4) << "to create edge";
+  // Create links
+  for (auto *ir_node : graph.Nodes()) {
+    auto it = ir_node_map.find(ir_node);
+    // Skip ControlDepVar.
+    if (it == ir_node_map.end()) continue;
+    auto *node = it->second;
+    for (auto *x : ir_node->inputs) {
+      if (!ir_node_map.count(x)) continue;
+      node->inlinks.push_back(ir_node_map.at(x));
+    }
+    for (auto *x : ir_node->outputs) {
+      if (!ir_node_map.count(x)) continue;
+      node->outlinks.push_back(ir_node_map.at(x));
+    }
+  }
+
+  Build();
+  PADDLE_ENFORCE(!inputs_.empty(),
+                 "Can't deduce any inputs from the graph, Is the graph empty?");
+
+  ir_graph = &graph;
+  VLOG(3) << "finished build from IR";
+}
+
 void DataFlowGraph::Clean() {
   for (auto &node : nodes.nodes()) {
     std::unordered_set<Node *> inlinks_set(node->inlinks.begin(),
@@ -61,11 +185,9 @@ void DataFlowGraph::Clean() {
     std::unordered_set<Node *> outlinks_set(node->outlinks.begin(),
                                             node->outlinks.end());
     if (inlinks_set.size() < node->inlinks.size()) {
-      LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs";
       node->inlinks.assign(inlinks_set.begin(), inlinks_set.end());
     }
     if (outlinks_set.size() < node->outlinks.size()) {
-      LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs";
       node->outlinks.assign(outlinks_set.begin(), outlinks_set.end());
     }
   }
@@ -112,10 +234,10 @@ GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
     const std::vector<Node *> &source)
     : queue_(source.begin(), source.end()) {}
 
-// GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
-//     GraphTraits<DataFlowGraph>::NodesBFSIterator &&other) noexcept
-//     : queue_(std::move(other.queue_)),
-//       visited_(std::move(other.visited_)) {}
+GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
+    GraphTraits<DataFlowGraph>::NodesBFSIterator &&other) noexcept
+    : queue_(std::move(other.queue_)),
+      visited_(std::move(other.visited_)) {}
 
 GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
     const GraphTraits<DataFlowGraph>::NodesBFSIterator &other)
@@ -159,7 +281,7 @@ bool GraphTraits<DataFlowGraph>::NodesBFSIterator::operator==(
   if (queue_.empty()) return other.queue_.empty();
   if ((!queue_.empty()) && (!other.queue_.empty())) {
     return queue_.front() == other.queue_.front() &&
-           visited_.size() == other.visited_.size();  // here need to check the
+           visited_.size() == other.visited_.size();
     // equality of queue and
     // visited. Just a light but week implementation.
   }
@@ -174,10 +296,10 @@ GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
   for (auto *x : source) stack_.push(x);
 }
 
-// GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
-//     GraphTraits<DataFlowGraph>::NodesDFSIterator &&other) noexcept
-//     : stack_(std::move(other.stack_)),
-//       visited_(std::move(other.visited_)) {}
+GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
+    GraphTraits<DataFlowGraph>::NodesDFSIterator &&other) noexcept
+    : stack_(std::move(other.stack_)),
+      visited_(std::move(other.visited_)) {}
 
 GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
     const GraphTraits<DataFlowGraph>::NodesDFSIterator &other)
@@ -339,7 +461,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
 
 void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
   std::vector<Node *> op_nodes;
-  for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
+  for (auto &node : GraphTraits<DataFlowGraph>(*graph).nodes_in_TS()) {
     if (node.type() == Node::Type::kValue || node.deleted()) {
       continue;
     }
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
index bb3ec6bbc1d9555386aba8837b019d2511653258..437e097acd24aad384df6712ce0de6106b3b5c65 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -26,6 +26,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
+#include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/inference/analysis/graph_traits.h"
 #include "paddle/fluid/inference/analysis/node.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -41,19 +42,43 @@ namespace analysis {
  */
 struct DataFlowGraph {
   NodeMap nodes;
-  std::vector<Node *> inputs;
-  std::vector<Node *> outputs;
+  // inputs and outputs are deduced from the graph.
+  // Used to interact with IR.
+  const framework::ir::Graph *ir_graph{nullptr};
 
   // Extract inputs and outputs of the graph.
   void Build();
 
+  void Build(const framework::proto::ProgramDesc &prog);
+
+  // Build a graph from ir::Graph.
+  void Build(const framework::ir::Graph &graph);
+
+  // Get an attribute.
+  AnyAttr &Attr(const std::string &key) { return attrs_[key]; }
+
   // Output a DOT graph file for debug.
   std::string DotString() const;
 
   std::string HumanReadableInfo(bool show_values = true,
                                 bool show_functions = true) const;
 
+  const std::vector<Node *> &inputs() const {
+    PADDLE_ENFORCE(!inputs_.empty(),
+                   "No inputs are deduced, need to Build() first.");
+    return inputs_;
+  }
+  const std::vector<Node *> &outputs() const {
+    PADDLE_ENFORCE(!outputs_.empty(),
+                   "No outputs are deduced, need to Build() first.");
+    return outputs_;
+  }
+
  private:
+  mutable std::vector<Node *> inputs_;
+  mutable std::vector<Node *> outputs_;
+  std::unordered_map<std::string, AnyAttr> attrs_;
+
   // Remove duplicate edges and so on.
   void Clean();
 };
@@ -70,7 +95,7 @@ struct GraphTraits<DataFlowGraph> {
       : public std::iterator<std::forward_iterator_tag, Node *> {
     NodesBFSIterator() = default;
     explicit NodesBFSIterator(const std::vector<Node *> &source);
-    // NodesBFSIterator(NodesBFSIterator &&other) noexcept;
+    NodesBFSIterator(NodesBFSIterator &&other) noexcept;
     // NOTE Heavy to use.
     NodesBFSIterator(const NodesBFSIterator &other);
 
@@ -93,8 +118,8 @@ struct GraphTraits<DataFlowGraph> {
   struct NodesDFSIterator
       : public std::iterator<std::forward_iterator_tag, Node *> {
     NodesDFSIterator() = default;
-    explicit NodesDFSIterator(const std::vector<Node *> &source);
-    // NodesDFSIterator(NodesDFSIterator &&other) noexcept;
+    NodesDFSIterator(const std::vector<Node *> &source);
+    NodesDFSIterator(NodesDFSIterator &&other) noexcept;
     NodesDFSIterator(const NodesDFSIterator &other);
 
     Node &operator*();
@@ -116,7 +141,7 @@ struct GraphTraits<DataFlowGraph> {
   struct NodesTSIterator
       : public std::iterator<std::forward_iterator_tag, Node *> {
     NodesTSIterator() = default;
-    explicit NodesTSIterator(const std::vector<Node *> &source);
+    NodesTSIterator(const std::vector<Node *> &source);
     NodesTSIterator(NodesTSIterator &&other)
         : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
       other.cursor_ = 0;
@@ -138,7 +163,7 @@ struct GraphTraits<DataFlowGraph> {
     size_t cursor_{0};
   };
 
-  explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
+  explicit GraphTraits(const DataFlowGraph &graph) : graph_(graph) {}
 
   // default use BFS to visit the nodes.
   iterator_range<NodesBFSIterator> nodes() {
@@ -156,20 +181,20 @@ struct GraphTraits<DataFlowGraph> {
 
  private:
   NodesBFSIterator nodes_bfs_begin() {
-    return NodesBFSIterator(graph_->inputs);
+    return NodesBFSIterator(graph_.inputs());
   }
   NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); }
 
   NodesDFSIterator nodes_dfs_begin() {
-    return NodesDFSIterator(graph_->inputs);
+    return NodesDFSIterator(graph_.inputs());
   }
   NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); }
 
-  NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_->inputs); }
+  NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_.inputs()); }
   NodesTSIterator nodes_ts_end() { return NodesTSIterator(); }
 
  private:
-  DataFlowGraph *graph_;
+  const DataFlowGraph &graph_;
 };
 
 // Extract the inputs and outputs of a graph. The inputs and outputs of a
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
index a881262665f156812da9e1576aa29b05fc398499..1682011c3d8cc9927a4b026b370671798cace625 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 
 namespace paddle {
@@ -24,20 +25,18 @@ TEST(DataFlowGraph, BFS) {
   auto dfg = ProgramDescToDFG(desc);
   dfg.Build();
 
-  for (auto *in : dfg.inputs) {
+  for (auto* in : dfg.inputs()) {
     LOG(INFO) << "inputs: " << in->name() << " "
               << static_cast<int>(in->type());
   }
-  for (auto *out : dfg.outputs) {
+  for (auto* out : dfg.outputs()) {
     LOG(INFO) << "outputs: " << out->name() << " "
               << static_cast<int>(out->type());
   }
 
-  GraphTraits<DataFlowGraph> trait(&dfg);
-  auto nodes = trait.nodes();
   size_t count = 0;
-  for (auto it = nodes.begin(); it != nodes.end(); ++it) {
-    LOG(INFO) << "visiting " << it->name();
+  for (auto& node : GraphTraits<DataFlowGraph>(dfg).nodes()) {
+    LOG(INFO) << "visiting " << node.name();
     ++count;
   }
   ASSERT_EQ(count, dfg.nodes.size());
@@ -45,13 +44,11 @@ TEST(DataFlowGraph, BFS) {
 
 TEST(DataFlowGraph, DFS) {
   auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
-  auto dfg = ProgramDescToDFG(desc);
-  dfg.Build();
-  GraphTraits<DataFlowGraph> trait(&dfg);
-  auto nodes = trait.nodes_in_DFS();
+  DataFlowGraph dfg;
+  dfg.Build(desc);
   size_t count = 0;
-  for (auto it = nodes.begin(); it != nodes.end(); ++it) {
-    LOG(INFO) << "visiting " << it->name();
+  for (auto& node : GraphTraits<DataFlowGraph>(dfg).nodes_in_DFS()) {
+    LOG(INFO) << "visiting " << node.name();
     ++count;
   }
   ASSERT_EQ(count, dfg.nodes.size());
@@ -74,21 +71,17 @@ TEST(DataFlowGraph, TS) {
   DataFlowGraph graph;
 
   for (int i = 0; i < 8; i++) {
-    auto *node = graph.nodes.Create(Node::Type::kValue);
+    auto* node = graph.nodes.Create(Node::Type::kValue);
     node->SetName("node-" + std::to_string(i));
   }
 
   auto add_link = [&](int i, int j) {
-    Node *source = graph.nodes.GetMutable(i);
-    Node *target = graph.nodes.GetMutable(j);
+    Node* source = graph.nodes.GetMutable(i);
+    Node* target = graph.nodes.GetMutable(j);
     target->inlinks.push_back(source);
     source->outlinks.push_back(target);
   };
 
-  graph.inputs.push_back(graph.nodes.GetMutable(0));
-  graph.inputs.push_back(graph.nodes.GetMutable(1));
-  graph.inputs.push_back(graph.nodes.GetMutable(2));
-
   add_link(0, 4);
   add_link(0, 5);
   add_link(1, 6);
@@ -97,8 +90,9 @@ TEST(DataFlowGraph, TS) {
   add_link(4, 7);
   add_link(4, 3);
   add_link(7, 3);
+  graph.Build();
 
-  auto its = GraphTraits<DataFlowGraph>(&graph).nodes_in_TS();
+  auto its = GraphTraits<DataFlowGraph>(graph).nodes_in_TS();
   std::vector<int> sorted_ids;
   for (auto it = its.begin(); it != its.end(); ++it) {
     LOG(INFO) << it->name();
@@ -122,6 +116,50 @@ TEST(DataFlowGraph, TS) {
   assert_positive_sequence_pair(4, 7);
 }
 
+TEST(DataFlowGraph, Build_ProgramDesc) {
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
+  DataFlowGraph graph;
+  graph.Build(desc);
+  ASSERT_EQ(graph.nodes.size(), 38UL);
+}
+
+void SetOp(framework::ProgramDesc* prog, const std::string& type,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetInput("Xs", inputs);
+  op->SetOutput("Xs", outputs);
+}
+
+TEST(DataFlowGraph, Build_IR_Graph) {
+  framework::ProgramDesc prog;
+  for (auto& v : std::vector<std::string>({"a", "b", "c", "d", "e", "f"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(framework::proto::VarType::SELECTED_ROWS);
+    if (v == "c") {
+      var->SetPersistable(true);
+    }
+  }
+
+  SetOp(&prog, "OP0", std::vector<std::string>({"a"}),
+        std::vector<std::string>({"b"}));
+  SetOp(&prog, "OP1", std::vector<std::string>({"a"}),
+        std::vector<std::string>({"c"}));
+  SetOp(&prog, "mul", std::vector<std::string>({"b", "c"}),
+        std::vector<std::string>({"d"}));
+  SetOp(&prog, "elementwise_add", std::vector<std::string>({"d", "e"}),
+        std::vector<std::string>({"f"}));
+
+  DataFlowGraph graph;
+
+  framework::ir::Graph ir_graph(prog);
+
+  graph.Build(ir_graph);
+
+  ASSERT_EQ(graph.nodes.size(), ir_graph.Nodes().size());
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
index 18c32fa09199003f17183207828cdfe4e627ae1a..8c7dd146e429a7f5cd28bdd418e457e8ea5680bd 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -23,9 +23,6 @@
 namespace paddle {
 namespace inference {
 
-DEFINE_int32(tensorrt_max_batchsize, 3, "TensorRT maximum batch size");
-DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
-
 namespace analysis {
 
 using framework::proto::ProgramDesc;
@@ -52,19 +49,15 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
 bool DataFlowGraphToFluidPass::Finalize() { return true; }
 
 void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
-  FilterRedundantOutputOfSubGraph(graph);
-  LOG(INFO) << "graph.inputs " << graph->inputs.size();
-  for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
+  // FilterRedundantOutputOfSubGraph(graph);
+  for (auto &node : GraphTraits<DataFlowGraph>(*graph).nodes_in_TS()) {
     if (node.deleted()) continue;
 
     switch (node.type()) {
       case Node::Type::kFunction: {
-        LOG(INFO) << "add function " << node.repr();
         AddFluidOp(&node);
       } break;
       case Node::Type::kFunctionBlock: {
-        LOG(INFO) << "add engine op " << node.repr() << " , "
-                  << static_cast<FunctionBlock *>(&node)->subgraph.size();
         AddEngineOp(&node);
       } break;
       default:
@@ -76,15 +69,27 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
 }
 
 void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
-  auto *ori_op = static_cast<framework::proto::OpDesc *>(node->pb_desc());
+  PADDLE_ENFORCE(node);
+  PADDLE_ENFORCE(node->IsFunction());
+  PADDLE_ENFORCE(node->pb_desc() || !node->pb_msg().empty(),
+                 "node has invalid protobuf repr.");
+
   // currently only the main block is analyzed.
+  PADDLE_ENFORCE(desc_);
   auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
   auto *op = main_block->add_ops();
-  *op = *ori_op;  // copy the attributes, by default, these will not be changed
-  // by analysis phrase.
-  // The inputs and outputs of the existing ops are not changed by tensorrt
-  // subgraph pass.
-  // NOTE It might be changed by other passes in the long run.
+
+  if (node->pb_desc()) {
+    auto *ori_op = static_cast<framework::proto::OpDesc *>(node->pb_desc());
+    *op =
+        *ori_op;  // copy the attributes, by default, these will not be changed
+    // by analysis phrase.
+    // The inputs and outputs of the existing ops are not changed by tensorrt
+    // subgraph pass.
+    // NOTE It might be changed by other passes in the long run.
+  } else {
+    op->ParseFromString(node->pb_msg());
+  }
 }
 
 void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
@@ -191,8 +196,6 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
   // Set attrs
   SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
   SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
-  SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
-  SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
   SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
   SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
   node->SetPbMsg(desc.Proto()->SerializeAsString());
@@ -221,10 +224,9 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
   framework::BlockDesc block_desc(nullptr, &proto);
   block_desc.Proto()->set_parent_idx(-1);
   block_desc.Proto()->set_idx(0);
-  LOG(INFO) << "origin variable size: "
-            << argument_->origin_program_desc->blocks(0).vars().size();
-  LOG(INFO) << "transformed variable size: "
-            << block_desc.Proto()->vars().size();
+  VLOG(4) << "origin variable size: "
+          << argument_->origin_program_desc->blocks(0).vars().size();
+  VLOG(4) << "transformed variable size: " << block_desc.Proto()->vars().size();
   // copy ops.
 
   for (auto *node : block_node->subgraph) {
@@ -258,7 +260,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 
 Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
   return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
-      FLAGS_inference_analysis_graphviz_log_root,
+      FLAGS_IA_graphviz_log_root,
       "data_flow_graph_to_fluid_graphviz_debugger"));
 }
 
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
index 59c47365aa6c8ad5886c4515850d264f69cc4670..0c9a8a0b7cae17bf2eaa714348ea1c9b5e43611b 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
@@ -27,9 +27,6 @@
 namespace paddle {
 namespace inference {
 
-DECLARE_int32(tensorrt_max_batchsize);
-DECLARE_int32(tensorrt_workspace_size);
-
 namespace analysis {
 class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
  public:
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
index c05b0e5d4690d0a447edf63a149903704bc2c9be..648b8f7d6a6ec4bafbad2838c5631e776c8699b1 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
@@ -29,7 +29,7 @@ void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) {
 
   auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png";
   std::string message;
-  LOG(INFO) << "draw to " << png_path;
+  VLOG(3) << "draw to " << png_path;
   ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message);
 }
 
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
index 511631d3e067f14bc1230d9e4b4d92dbe604e1d4..51bd0ac42d455f68ac5d70f0ce9703dfad6070d4 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -52,72 +52,7 @@ bool FluidToDataFlowGraphPass::Finalize() { return true; }
 void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
   PADDLE_ENFORCE(graph);
   PADDLE_ENFORCE(desc_);
-  // insert vars
-  // The `var2id` keeps a map from a variable's name to its Node-id, the Node-id
-  // will keep updating to its latest alias during the graph-building.
-  std::unordered_map<std::string, size_t> var2id;
-  auto &main_block = desc_->blocks(framework::kRootBlockIndex);
-  for (int i = 0; i < main_block.vars_size(); i++) {
-    const auto &var = main_block.vars(i);
-    auto *v = graph->nodes.Create(Node::Type::kValue);
-    v->SetName(var.name());
-    v->SetPbDesc(const_cast<void *>(static_cast<const void *>(&var)));
-    v->SetPbMsg(var.SerializeAsString());
-    var2id[var.name()] = v->id();
-  }
-
-  // The variables in a SSA can only write once, so if a variable is written
-  // multiple times(quite common in our ProgramDesc design), multiple alias
-  // Nodes of this variable will be created, and each will just write once.
-
-  // An set that keep all the names of the variables(the original, not alias)
-  // that have been written(as outputs). Once an Op's output variable hit the
-  // set, it should create a new alias and update the global alias for this
-  // variable. And that make a Data Flow Graph a SSA.
-  std::unordered_set<Node *> unique_written_vars;
-  for (int i = 0; i < main_block.ops_size(); i++) {
-    const auto &op = main_block.ops(i);
-    auto *o = graph->nodes.Create(Node::Type::kFunction);
-    o->SetName(op.type());
-    static_cast<Function *>(o)->SetFuncType(op.type());
-    // Link to the original protobuf message's memory, make it easier to
-    // generate from a data flow graph to fluid ProgramDesc.
-    o->SetPbDesc(const_cast<void *>(static_cast<const void *>(&op)));
-    o->SetPbMsg(op.SerializeAsString());
-
-    // set inputs and outputs
-    for (int j = 0; j < op.inputs_size(); j++) {
-      auto &in_var = op.inputs(j);
-      for (int k = 0; k < in_var.arguments_size(); k++) {
-        auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
-        in->outlinks.push_back(o);
-        o->inlinks.push_back(in);
-      }
-    }
-    for (int j = 0; j < op.outputs_size(); j++) {
-      auto &out_var = op.outputs(j);
-      for (int k = 0; k < out_var.arguments_size(); k++) {
-        auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
-        if (unique_written_vars.count(out)) {
-          // Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
-          auto *out_alias = graph->nodes.Create(Node::Type::kValue);
-          out_alias->SetName(out->name());
-          out_alias->SetPbDesc(out->pb_desc());
-          out_alias->SetPbMsg(out->pb_msg());
-          var2id[out_alias->name()] =
-              out_alias->id();  // update variable's alias Node
-          LOG(INFO) << "loop found in graph, create SSA alias node ["
-                    << out_alias->repr() << "] for [" << out->repr() << "]";
-          out = out_alias;
-        }
-        out->inlinks.push_back(o);
-        o->outlinks.push_back(out);
-        unique_written_vars.insert(out);
-      }
-    }
-  }
-  // Analysis and extract the inputs and outputs of this graph.
-  graph->Build();
+  graph->Build(*desc_);
 }
 
 namespace {
@@ -133,7 +68,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 
 Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
   return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
-      FLAGS_inference_analysis_graphviz_log_root, "fluid-to-dfg-debuger"));
+      FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger"));
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
index d218dcd05015aa4636c16569de4addf4936c8cd5..267a0a84ebf75615e0b390f4a1b3bf3b51793fc7 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
@@ -30,7 +30,7 @@ TEST(FluidToDataFlowGraphPass, Test) {
   ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL);
   pass.Finalize();
   ASSERT_FALSE(argument.main_dfg->DotString().empty());
-  EXPECT_FALSE(argument.main_dfg->inputs.empty());
+  EXPECT_FALSE(argument.main_dfg->inputs().empty());
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..073f49752872cbb65fddc74be75ec28d4dd0bbaf
--- /dev/null
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc
@@ -0,0 +1,15 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h"
diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa3f8d313bbdd6733fa3878dd7023e125b6ced36
--- /dev/null
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class FluidToIrPass final : public DataFlowGraphPass {
+ public:
+  FluidToIrPass() = default;
+
+  bool Initialize(Argument *argument) override {
+    ANALYSIS_ARGUMENT_CHECK_FIELD(argument);
+    if (argument->origin_program_desc) {
+      LOG(WARNING) << "argument's origin_program_desc is already set, might "
+                      "duplicate called";
+    }
+    // set fluid model program path
+    if (!argument->fluid_model_program_path) {
+      ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir);
+      argument->fluid_model_program_path.reset(
+          new std::string(*argument->fluid_model_dir + "/__model__"));
+    }
+    ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
+    // Load program.
+    auto program = LoadProgramDesc(*argument->fluid_model_program_path);
+    argument->origin_program_desc.reset(
+        new framework::proto::ProgramDesc(program));
+    // Create main data flow graph.
+    if (!argument->main_dfg) {
+      argument->main_dfg.reset(new DataFlowGraph);
+    }
+    // Persist the ProgramDesc in graph's attribute. The IR graph just keep the
+    // address, will segfault if the original ProgramDesc destroys.
+    auto &ir_program_p = argument->main_dfg->Attr("ir_program_desc").Pointer();
+    ir_program_p = new framework::ProgramDesc(program);
+
+    argument_ = argument;
+    return true;
+  }
+
+  bool Finalize() override { return true; }
+
+  void Run(DataFlowGraph *graph) override {
+    // Call all the IR Passes
+    IRPassManager ir_passes(*static_cast<framework::ProgramDesc *>(
+        argument_->main_dfg->Attr("ir_program_desc").Pointer()));
+    ir_passes.Apply(std::vector<std::string>(
+        {// Manual update the passes here.
+         "graph_viz_pass", "infer_clean_graph_pass", "graph_viz_pass",
+         "fc_fuse_pass", "graph_viz_pass"}));
+
+    PADDLE_ENFORCE(argument_->main_dfg.get());
+    argument_->main_dfg->Build(ir_passes.graph());
+    // PADDLE_ENFORCE(argument_->main_dfg->IsFullyConnected());
+  }
+
+  std::string repr() const override { return "fluid-to-ir-pass"; }
+
+ private:
+  Argument *argument_{nullptr};
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..af934f261baa3807059ce6ab036545594630df58
--- /dev/null
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h"
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST(FluidToIrPass, Test) {
+  FluidToIrPass pass;
+  Argument argument(FLAGS_inference_model_dir);
+  pass.Initialize(&argument);
+  pass.Run(argument.main_dfg.get());
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
+
+USE_PASS(fc_fuse_pass);
+USE_PASS(graph_viz_pass);
+USE_PASS(infer_clean_graph_pass);
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index a0f912b251d5ea29594a7f601d5b2bce91201790..5151e2b69ac199dea136535ba445e890596f6227 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <sys/stat.h>
 #include <cstdio>
 #include <fstream>
 #include <string>
@@ -151,6 +152,23 @@ static framework::proto::ProgramDesc LoadProgramDesc(
   return program_desc;
 }
 
+static bool FileExists(const std::string &filepath) {
+  std::ifstream file(filepath);
+  bool exists = file.is_open();
+  file.close();
+  return exists;
+}
+
+static bool PathExists(const std::string &path) {
+  struct stat statbuf;
+  if (stat(path.c_str(), &statbuf) != -1) {
+    if (S_ISDIR(statbuf.st_mode)) {
+      return true;
+    }
+  }
+  return false;
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d849b637bcf3fe3944ad11680bbe041e19a71e24
--- /dev/null
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -0,0 +1,45 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
+#include <string>
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+IRPassManager::IRPassManager(const ProgramDesc& program) {
+  graph_.reset(new framework::ir::Graph(program));
+}
+
+void IRPassManager::Apply(const std::vector<std::string>& passes) {
+  graph_->Set("graph_viz_path", new std::string("./1.dot"));
+  // Apply all the passes
+  std::string pre_pass;
+  for (const std::string& pass_name : passes) {
+    LOG(WARNING) << "Running IR pass [" << pass_name << "]";
+    auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
+    if (pass_name == "graph_viz_pass") {
+      std::string dot_file_path =
+          "ir_" + (pre_pass.empty() ? "origin" : pre_pass) + ".dot";
+      pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
+    }
+    graph_ = pass->Apply(std::move(graph_));
+    pre_pass = pass_name;
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..3338e37ecf1c591a631fd829a05b07e562af703e
--- /dev/null
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file defines IRPassManager, it helps control the passes in IR. Inference
+ * phrase will load the model program and parameters from disk, that is quite
+ * different from the training phase.
+ * This manager will control the Passes and make the passes in IR work smoothly
+ * for inference.
+ */
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+using framework::ProgramDesc;
+
+class IRPassManager final {
+ public:
+  IRPassManager(const ProgramDesc& program);
+
+  void Apply(const std::vector<std::string>& passes);
+
+  framework::ir::Graph& graph() const { return *graph_; }
+
+ private:
+  std::unique_ptr<framework::ir::Graph> graph_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc
index 1c429176424bd5c1d8fa5e015c19d698f966880e..c313db08875669010ddcca13aa66b383ee6d26f8 100644
--- a/paddle/fluid/inference/analysis/model_store_pass.cc
+++ b/paddle/fluid/inference/analysis/model_store_pass.cc
@@ -35,19 +35,21 @@ void ModelStorePass::Run(DataFlowGraph *x) {
   std::stringstream ss;
   // NOTE these commands only works on linux.
   ss << "mkdir -p " << *argument_->model_output_store_path;
-  LOG(INFO) << "run command: " << ss.str();
+  VLOG(3) << "run command: " << ss.str();
   PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
   ss.str("");
 
   ss << "cp " << *argument_->fluid_model_dir << "/*"
      << " " << *argument_->model_output_store_path;
-  LOG(INFO) << "run command: " << ss.str();
+  VLOG(3) << "run command: " << ss.str();
   PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
 
   // Store program
   PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc,
                           "program desc is not transformed, should call "
                           "DataFlowGraphToFluidPass first.");
+  VLOG(3) << "store analyzed program to "
+          << *argument_->model_output_store_path;
   const std::string program_output_path =
       *argument_->model_output_store_path + "/__model__";
   std::ofstream file(program_output_path, std::ios::binary);
@@ -58,6 +60,8 @@ void ModelStorePass::Run(DataFlowGraph *x) {
   file.write(serialized_message.c_str(), serialized_message.size());
 }
 
+bool ModelStorePass::Finalize() { return true; }
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/model_store_pass.h
index fac7083925776b6209d49255c9e67b930cb1250b..3a2869e30bd80cfd0756f8e21acb414656620eaa 100644
--- a/paddle/fluid/inference/analysis/model_store_pass.h
+++ b/paddle/fluid/inference/analysis/model_store_pass.h
@@ -44,6 +44,8 @@ class ModelStorePass : public DataFlowGraphPass {
     model in the disk, and that model can be reloaded for prediction again.)DD";
   }
 
+  bool Finalize() override;
+
  private:
   Argument* argument_{nullptr};
 };
diff --git a/paddle/fluid/inference/analysis/model_store_pass_tester.cc b/paddle/fluid/inference/analysis/model_store_pass_tester.cc
index 5f3526dd504e77e58d79b4f675db86a22fd0f26b..d6493fc25edf25003504542f1b01c4105754c8df 100644
--- a/paddle/fluid/inference/analysis/model_store_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/model_store_pass_tester.cc
@@ -30,7 +30,7 @@ TEST(DFG_StorePass, test) {
   argument.model_output_store_path.reset(
       new std::string("./_dfg_store_pass_tmp"));
   // disable storage in alalyzer
-  FLAGS_inference_analysis_output_storage_path = "";
+  FLAGS_IA_output_storage_path = "";
   analyzer.Run(&argument);
 
   ModelStorePass pass;
diff --git a/paddle/fluid/inference/analysis/node.cc b/paddle/fluid/inference/analysis/node.cc
index f2e918f3ff41d9db0c3ec38561015967bed26f4e..3339b5044df0cf91d00aa9ddad310d4bf263bc3c 100644
--- a/paddle/fluid/inference/analysis/node.cc
+++ b/paddle/fluid/inference/analysis/node.cc
@@ -20,17 +20,6 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-template <>
-std::string &NodeAttr::As<std::string>() {
-  if (data_.empty()) {
-    type_index_ = std::type_index(typeid(std::string));
-  }
-  PADDLE_ENFORCE_EQ(type_index_, std::type_index(typeid(std::string)));
-  return data_;
-}
-
-std::string &NodeAttr::String() { return As<std::string>(); }
-
 std::vector<Dot::Attr> Value::dot_attrs() const {
   return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"),
                                  Dot::Attr("shape", "box"),
diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h
index 47e524bc5c4a6b1324d5f182053129311487522d..af34156bc2f101465d87cb10e2155745022eb521 100644
--- a/paddle/fluid/inference/analysis/node.h
+++ b/paddle/fluid/inference/analysis/node.h
@@ -29,6 +29,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/analysis/device.h"
 #include "paddle/fluid/inference/analysis/dot.h"
 #include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/platform/variant.h"
 
 namespace paddle {
 namespace inference {
@@ -37,41 +38,36 @@ namespace analysis {
 class NodeMap;
 
 // A helper class to maintain the status from Pass.
-struct NodeAttr {
+struct AnyAttr {
+  using any_t =
+      boost::variant<bool, float, int32_t, int64_t, void *, std::string>;
   // NOTE T should be a primary type or a struct combined by several primary
   // types.
   // NOTE the STL containers should not use here.
   // Some usages
   //   Attr attr;
   //   attr.Bool() = true;
-
   bool &Bool() { return As<bool>(); }
   float &Float() { return As<float>(); }
   int32_t &Int32() { return As<int32_t>(); }
   int64_t &Int64() { return As<int64_t>(); }
   void *&Pointer() { return As<void *>(); }
-  std::string &String();
+  std::string &String() { return As<std::string>(); }
 
- private:
   template <typename T>
   T &As() {
-    // init storage in the first usage.
-    if (data_.empty()) {
-      VLOG(4) << "resize data to " << sizeof(T);
-      type_index_ = std::type_index(typeid(T));
-      data_.resize(sizeof(T));
+    if (type_index_ == typeid(AnyAttr)) {
+      type_index_ = typeid(T);
+      any_data_ = T();
+    } else {
+      PADDLE_ENFORCE(type_index_ == typeid(T), "fetch error type");
     }
-    PADDLE_ENFORCE(framework::IsType<T>(type_index_),
-                   "type not matched, origin is %s, want %s",
-                   DataTypeNamer::Global().repr(type_index_),
-                   DataTypeNamer::Global().repr<T>());
-    PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
-    return *reinterpret_cast<T *>(&data_[0]);
+    return boost::get<T>(any_data_);
   }
 
  private:
-  std::string data_;
-  std::type_index type_index_{typeid(NodeAttr)};
+  any_t any_data_;
+  std::type_index type_index_{typeid(AnyAttr)};
 };
 
 /*
@@ -108,7 +104,7 @@ class Node {
 
   // Get an additional attribute and convert it to T data type. NOTE this will
   // silently create a new attribute if not exists.
-  NodeAttr &attr(const std::string &name) const { return attrs_[name]; }
+  AnyAttr &attr(const std::string &name) const { return attrs_[name]; }
 
   int id() const { return id_; }
 
@@ -153,7 +149,7 @@ class Node {
   Type type_{Type::kNone};
   // Mark this node is deleted by some pass.
   bool deleted_{false};
-  mutable std::unordered_map<std::string, NodeAttr> attrs_;
+  mutable std::unordered_map<std::string, AnyAttr> attrs_;
 };
 
 class Function;
diff --git a/paddle/fluid/inference/analysis/node_tester.cc b/paddle/fluid/inference/analysis/node_tester.cc
index ea832a3a7e47758be9b6bd59a4325ddb576ec446..9207c15373fb4264ff0e738e93ae88e1c08b554c 100644
--- a/paddle/fluid/inference/analysis/node_tester.cc
+++ b/paddle/fluid/inference/analysis/node_tester.cc
@@ -20,6 +20,24 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
+TEST(NodeAttr, bool) {
+  AnyAttr x;
+  x.Bool() = true;
+  ASSERT_EQ(x.Bool(), true);
+}
+
+TEST(NodeAttr, int32) {
+  AnyAttr x;
+  x.Int32() = 32;
+  ASSERT_EQ(x.Int32(), 32);
+}
+
+TEST(NodeAttr, string) {
+  AnyAttr x;
+  x.String() = "Hello";
+  ASSERT_EQ(x.String(), "Hello");
+}
+
 TEST(Node, Attr) {
   // Node is an abstract class, use Value instead for they share the same Attr
   // logic.
@@ -27,6 +45,9 @@ TEST(Node, Attr) {
   auto* node = nodes.Create(Node::Type::kValue);
   node->attr("v0").Int32() = 2008;
   ASSERT_EQ(node->attr("v0").Int32(), 2008);
+
+  node->attr("str").String() = "hello world";
+  ASSERT_EQ(node->attr("str").String(), "hello world");
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h
index 6806f9ff7dada2c1e2328e1ffbfd225afefcf474..7719c6f5ff3c940948c7bdbcb25513cdf430281b 100644
--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/pass.h
@@ -63,7 +63,7 @@ class Pass {
   // Human-readable short representation.
   virtual std::string repr() const = 0;
   // Human-readable long description.
-  virtual std::string description() const = 0;
+  virtual std::string description() const { return "No DOC"; }
 };
 
 // NodePass process on any Node types.
diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc
index b428bb22b1f0c5c1a47fc4c46c9070c1ace4a228..cfdca33882ea00a28e3ea51ca5fd77ec9605bf3a 100644
--- a/paddle/fluid/inference/analysis/pass_manager.cc
+++ b/paddle/fluid/inference/analysis/pass_manager.cc
@@ -22,7 +22,7 @@ namespace analysis {
 bool PassManager::Initialize(Argument* argument) {
   argument_ = argument;
   for (auto& pass : data_) {
-    LOG(INFO) << "Initializing pass " << pass->repr();
+    LOG(WARNING) << "Initializing pass [" << pass->repr() << "]";
     if (!pass->Initialize(argument)) {
       LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
       return false;
@@ -33,8 +33,9 @@ bool PassManager::Initialize(Argument* argument) {
 
 void DfgPassManager::RunAll() {
   PADDLE_ENFORCE(argument_);
+  LOG(INFO) << "Total " << data_.size() << " passes";
   for (auto& pass : data_) {
-    VLOG(4) << "Running pass [" << pass->repr() << "]";
+    LOG(WARNING) << "Running pass [" << pass->repr() << "]";
     pass->Run(argument_->main_dfg.get());
   }
 }
@@ -42,8 +43,7 @@ void DfgPassManager::RunAll() {
 void NodePassManager::RunAll() {
   PADDLE_ENFORCE(argument_);
   PADDLE_ENFORCE(argument_->main_dfg.get());
-  auto trait =
-      GraphTraits<DataFlowGraph>(argument_->main_dfg.get()).nodes_in_DFS();
+  auto trait = GraphTraits<DataFlowGraph>(*argument_->main_dfg).nodes_in_DFS();
   for (auto& node : trait) {
     for (auto& pass : data_) {
       pass->Run(&node);
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc
index 80809d4c43ca08298bad25cf614dcb4117d3f99a..670a8de667494c655bed15aa3e4ce8265448635a 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -34,7 +34,7 @@ inline void MarkOutLinksInSubGraph(const Function *func) {
 }
 
 void SubGraphSplitter::MarkNodesInsideSubGraph() {
-  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) {
+  for (auto &node : GraphTraits<DataFlowGraph>(*graph_).nodes()) {
     if (node_inside_subgraph_teller_(&node)) {
       node.attr(kMarkerAttrName).Bool() = true;
       if (node.type() == Node::Type::kFunction) {
@@ -76,7 +76,7 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
 
 std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
   std::vector<Node *> marked_nodes;
-  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes_in_TS()) {
+  for (auto &node : GraphTraits<DataFlowGraph>(*graph_).nodes_in_TS()) {
     if (node.attr(kMarkerAttrName).Bool()) {
       marked_nodes.push_back(&node);
     }
@@ -153,6 +153,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() {
       inlink_or_outlink_cleaner(o->inlinks);
     }
   }
+  FilterRedundantOutputOfSubGraph(graph_);
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
index f736e385c11add152dc9ab9485bf1de40f80b2f3..9f51fafe0b2a66f9d062a6b751fe7a3bc662ce7c 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
@@ -69,8 +69,8 @@ class DfgDebuggerPass : public DFG_GraphvizDrawPass {
 };
 
 Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
-  DFG_GraphvizDrawPass::Config config(
-      FLAGS_inference_analysis_graphviz_log_root, "tensorrt_marked_node");
+  DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root,
+                                      "tensorrt_marked_node");
   return new DfgDebuggerPass(config);
 }
 bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; }
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 83867e0a2c198f265cb36be3a71546796dfbec67..0ca1af455ca10fa6995ad3a1c33825108a3fd7ad 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -18,7 +18,10 @@ if(APPLE)
 endif(APPLE)
 
 
-set(inference_deps paddle_inference_api paddle_fluid_api)
+set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager
+  graph_viz_pass fc_fuse_pass
+    infer_clean_graph_pass
+  )
 
 if(WITH_GPU AND TENSORRT_FOUND)
     set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
@@ -60,19 +63,22 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
 inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
 endif()
 
-if (WITH_ANAKIN) # only needed in CI
+if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
     # compile the libinference_anakin_api.a and anakin.so.
-    nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
-    #nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)
+    cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml)
+    cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
     function(anakin_target target_name)
       target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
     endfunction()
     anakin_target(inference_anakin_api)
-    #anakin_target(inference_anakin_api_shared)
+    anakin_target(inference_anakin_api_shared)
     if (WITH_TESTING)
-        cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
+        cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc 
                 ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
-                DEPS inference_anakin_api dynload_cuda SERIAL)
-        target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
+                DEPS inference_anakin_api_shared dynload_cuda SERIAL)
+        cc_test(api_anakin_engine_rnn_tester SRCS api_anakin_engine_rnn_tester.cc 
+                ARGS --model=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin 
+                     --datapath=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn_data.txt
+                DEPS inference_anakin_api_shared dynload_cuda SERIAL)
     endif(WITH_TESTING)
 endif()
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index 63c3f0d7b3f5c2b9246e2b041796caf5eb562826..5f1e1b548c7b7daa66932571d7053701bc0bd1f6 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -1,11 +1,8 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
 http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc
index 6b374ceefbc180a5c22abe591f12e1c3d89bc64a..ea66aa89b87ba3c25cdcd5eb2c5155a481ef7987 100644
--- a/paddle/fluid/inference/api/api_anakin_engine.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine.cc
@@ -13,9 +13,22 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/api/api_anakin_engine.h"
+
+#ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
+#endif
+
+#include <mkl_service.h>
+#include <omp.h>
+#include <map>
+#include <string>
+#include <utility>
 #include <vector>
 
+#include "framework/core/net/net.h"
+#include "framework/operators/ops.h"
+#include "saber/funcs/timer.h"
+
 namespace paddle {
 
 template <typename Target>
@@ -23,16 +36,24 @@ PaddleInferenceAnakinPredictor<Target>::PaddleInferenceAnakinPredictor(
     const AnakinConfig &config) {
   CHECK(Init(config));
 }
-
+template <>
+PaddleInferenceAnakinPredictor<anakin::X86>::PaddleInferenceAnakinPredictor(
+    const AnakinConfig &config) {
+  omp_set_dynamic(0);
+  omp_set_num_threads(1);
+  mkl_set_num_threads(1);
+  CHECK(Init(config));
+}
 template <typename Target>
 bool PaddleInferenceAnakinPredictor<Target>::Init(const AnakinConfig &config) {
   if (!(graph_.load(config.model_file))) {
-    LOG(FATAL) << "fail to load graph from " << config.model_file;
+    VLOG(3) << "fail to load graph from " << config.model_file;
     return false;
   }
   auto inputs = graph_.get_ins();
   for (auto &input_str : inputs) {
     graph_.ResetBatchSize(input_str, config.max_batch_size);
+    max_batch_size_ = config.max_batch_size;
   }
   // optimization for graph
   if (!(graph_.Optimize())) {
@@ -52,15 +73,15 @@ bool PaddleInferenceAnakinPredictor<Target>::Run(
     std::vector<PaddleTensor> *output_data, int batch_size) {
   for (const auto &input : inputs) {
     if (input.dtype != PaddleDType::FLOAT32) {
-      LOG(ERROR) << "Only support float type inputs. " << input.name
-                 << "'s type is not float";
+      VLOG(3) << "Only support float type inputs. " << input.name
+              << "'s type is not float";
       return false;
     }
     auto d_tensor_in_p = executor_p_->get_in(input.name);
-    auto net_shape = d_tensor_in_p->valid_shape();
+    auto net_shape = d_tensor_in_p->shape();
     if (net_shape.size() != input.shape.size()) {
-      LOG(ERROR) << " input  " << input.name
-                 << "'s shape size should be equal to that of net";
+      VLOG(3) << " input  " << input.name
+              << "'s shape size should be equal to that of net";
       return false;
     }
     int sum = 1;
@@ -79,21 +100,45 @@ bool PaddleInferenceAnakinPredictor<Target>::Run(
     }
     d_tensor_in_p->reshape(tmp_shape);
 
+    if (input.lod.size() > 0) {
+      if (input.lod.size() > 1) {
+        VLOG(3) << " input lod first dim should <=1, but you set "
+                << input.lod.size();
+        return false;
+      }
+      std::vector<int> offset(input.lod[0].begin(), input.lod[0].end());
+      d_tensor_in_p->set_seq_offset(offset);
+      VLOG(3) << "offset.size(): " << offset.size();
+      for (int i = 0; i < offset.size(); i++) {
+        VLOG(3) << offset[i];
+      }
+    }
+
     float *d_data_p = d_tensor_in_p->mutable_data();
-    if (cudaMemcpy(d_data_p, static_cast<float *>(input.data.data()),
-                   d_tensor_in_p->valid_size() * sizeof(float),
-                   cudaMemcpyHostToDevice) != 0) {
-      LOG(ERROR) << "copy data from CPU to GPU error";
-      return false;
+
+#ifdef PADDLE_WITH_CUDA
+    if (std::is_same<anakin::NV, Target>::value) {
+      if (cudaMemcpy(d_data_p, static_cast<float *>(input.data.data()),
+                     d_tensor_in_p->valid_size() * sizeof(float),
+                     cudaMemcpyHostToDevice) != 0) {
+        VLOG(3) << "copy data from CPU to GPU error";
+        return false;
+      }
+    }
+#endif
+    if (std::is_same<anakin::X86, Target>::value) {
+      memcpy(d_data_p, static_cast<float *>(input.data.data()),
+             d_tensor_in_p->valid_size() * sizeof(float));
     }
-    cudaStreamSynchronize(NULL);
   }
+#ifdef PADDLE_WITH_CUDA
   cudaDeviceSynchronize();
   executor_p_->prediction();
   cudaDeviceSynchronize();
+#endif
 
   if (output_data->empty()) {
-    LOG(ERROR) << "At least one output should be set with tensors' names.";
+    VLOG(3) << "At least one output should be set with tensors' names.";
     return false;
   }
   for (auto &output : *output_data) {
@@ -102,14 +147,22 @@ bool PaddleInferenceAnakinPredictor<Target>::Run(
     if (output.data.length() < tensor->valid_size() * sizeof(float)) {
       output.data.Resize(tensor->valid_size() * sizeof(float));
     }
-    // Copy data from GPU -> CPU
-    if (cudaMemcpy(output.data.data(), tensor->mutable_data(),
-                   tensor->valid_size() * sizeof(float),
-                   cudaMemcpyDeviceToHost) != 0) {
-      LOG(ERROR) << "copy data from GPU to CPU error";
-      return false;
+
+#if PADDLE_WITH_CUDA
+    if (std::is_same<anakin::NV, Target>::value) {
+      // Copy data from GPU -> CPU
+      if (cudaMemcpy(output.data.data(), tensor->mutable_data(),
+                     tensor->valid_size() * sizeof(float),
+                     cudaMemcpyDeviceToHost) != 0) {
+        VLOG(3) << "copy data from GPU to CPU error";
+        return false;
+      }
+    }
+#endif
+    if (std::is_same<anakin::X86, Target>::value) {
+      memcpy(output.data.data(), tensor->mutable_data(),
+             tensor->valid_size() * sizeof(float));
     }
-    cudaStreamSynchronize(NULL);
   }
   return true;
 }
@@ -132,7 +185,7 @@ PaddleInferenceAnakinPredictor<Target>::Clone() {
   auto anakin_predictor_p =
       dynamic_cast<PaddleInferenceAnakinPredictor<Target> *>(cls.get());
   if (!anakin_predictor_p) {
-    LOG(ERROR) << "fail to call Init";
+    VLOG(3) << "fail to call Init";
     return nullptr;
   }
   anakin_predictor_p->get_executer().init(graph_);
@@ -162,6 +215,44 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     VLOG(3) << "Anakin Predictor create on unknown platform.";
     return nullptr;
   }
-};
+}
+
+#ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER
+template <typename Target>
+using executor_t =
+    anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>;
+
+template <typename Target>
+void DisplayOpTimer(executor_t<Target> *net_executor, int epoch) {
+  std::vector<float> op_time = net_executor->get_op_time();
+  auto exec_funcs = net_executor->get_exec_funcs();
+  auto op_param = net_executor->get_op_param();
+  for (int i = 0; i < op_time.size(); i++) {
+    LOG(INFO) << "name: " << exec_funcs[i].name
+              << " op_type: " << exec_funcs[i].op_name
+              << " op_param: " << op_param[i] << " time " << op_time[i] / epoch;
+  }
+  std::map<std::string, float> op_map;
+  for (int i = 0; i < op_time.size(); i++) {
+    auto it = op_map.find(op_param[i]);
+    if (it != op_map.end())
+      op_map[op_param[i]] += op_time[i];
+    else
+      op_map.insert(std::pair<std::string, float>(op_param[i], op_time[i]));
+  }
+  for (auto it = op_map.begin(); it != op_map.end(); ++it) {
+    LOG(INFO) << it->first << "  " << (it->second) / epoch << " ms";
+  }
+}
+#endif
+
+template <typename Target>
+PaddleInferenceAnakinPredictor<Target>::~PaddleInferenceAnakinPredictor() {
+#ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER
+  DisplayOpTimer<Target>(executor_p_, max_batch_size_);
+#endif
+  delete executor_p_;
+  executor_p_ = nullptr;
+}
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h
index 836badd9799228c6c294dcad5df73d039d36a1ff..dd08661880d8cc3a9f4401e9af91a3d10e6579b6 100644
--- a/paddle/fluid/inference/api/api_anakin_engine.h
+++ b/paddle/fluid/inference/api/api_anakin_engine.h
@@ -47,10 +47,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
   anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
   get_executer();
 
-  ~PaddleInferenceAnakinPredictor() override {
-    delete executor_p_;
-    executor_p_ = nullptr;
-  };
+  ~PaddleInferenceAnakinPredictor() override;
 
  private:
   bool Init(const AnakinConfig& config);
@@ -60,6 +57,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
   anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>*
       executor_p_{nullptr};
   AnakinConfig config_;
+  int max_batch_size_{0};
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc b/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6183864234e85b89e94821890d9606b082c59233
--- /dev/null
+++ b/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc
@@ -0,0 +1,315 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gflags/gflags.h>
+#include <sys/time.h>
+#include <time.h>
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <thread>  // NOLINT
+#include <vector>
+#include "framework/core/net/net.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+DEFINE_string(model, "", "Directory of the inference model.");
+DEFINE_string(datapath, "", "Path of the dataset.");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+
+// Timer for timer
+class Timer {
+ public:
+  double start;
+  double startu;
+  void tic() {
+    struct timeval tp;
+    gettimeofday(&tp, NULL);
+    start = tp.tv_sec;
+    startu = tp.tv_usec;
+  }
+  double toc() {
+    struct timeval tp;
+    gettimeofday(&tp, NULL);
+    double used_time_ms =
+        (tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
+    return used_time_ms;
+  }
+};
+
+std::vector<std::string> string_split(std::string in_str,
+                                      std::string delimiter) {
+  std::vector<std::string> seq;
+  int found = in_str.find(delimiter);
+  int pre_found = -1;
+  while (found != std::string::npos) {
+    if (pre_found == -1) {
+      seq.push_back(in_str.substr(0, found));
+    } else {
+      seq.push_back(in_str.substr(pre_found + delimiter.length(),
+                                  found - delimiter.length() - pre_found));
+    }
+    pre_found = found;
+    found = in_str.find(delimiter, pre_found + delimiter.length());
+  }
+  seq.push_back(
+      in_str.substr(pre_found + 1, in_str.length() - (pre_found + 1)));
+  return seq;
+}
+std::vector<std::string> string_split(
+    std::string in_str, std::vector<std::string>& delimiter) {  // NOLINT
+  std::vector<std::string> in;
+  std::vector<std::string> out;
+  out.push_back(in_str);
+  for (auto del : delimiter) {
+    in = out;
+    out.clear();
+    for (auto s : in) {
+      auto out_s = string_split(s, del);
+      for (auto o : out_s) {
+        out.push_back(o);
+      }
+    }
+  }
+  return out;
+}
+
+class Data {
+ public:
+  Data(std::string file_name, int batch_size)
+      : _batch_size(batch_size), _total_length(0) {
+    _file.open(file_name);
+    _file.seekg(_file.end);
+    _total_length = _file.tellg();
+    _file.seekg(_file.beg);
+  }
+  void get_batch_data(std::vector<std::vector<float>>& fea,         // NOLINT
+                      std::vector<std::vector<float>>& week_fea,    // NOLINT
+                      std::vector<std::vector<float>>& time_fea,    // NOLINT
+                      std::vector<long unsigned int>& seq_offset);  // NOLINT
+
+ private:
+  std::fstream _file;
+  int _total_length;
+  int _batch_size;
+};
+
+void Data::get_batch_data(
+    std::vector<std::vector<float>>& fea,          // NOLINT
+    std::vector<std::vector<float>>& week_fea,     // NOLINT
+    std::vector<std::vector<float>>& time_fea,     // NOLINT
+    std::vector<long unsigned int>& seq_offset) {  // NOLINT
+  int seq_num = 0;
+  long unsigned int cum = 0;  // NOLINT
+
+  char buf[10000];
+  seq_offset.clear();
+  seq_offset.push_back(0);
+  fea.clear();
+  week_fea.clear();
+  time_fea.clear();
+  while (_file.getline(buf, 10000)) {
+    std::string s = buf;
+    std::vector<std::string> deli_vec = {":"};
+    std::vector<std::string> data_vec = string_split(s, deli_vec);
+
+    std::vector<std::string> seq;
+    seq = string_split(data_vec[0], {"|"});
+
+    for (auto link : seq) {
+      std::vector<std::string> data = string_split(link, ",");
+      std::vector<float> vec;
+      for (int i = 0; i < data.size(); i++) {
+        vec.push_back(atof(data[i].c_str()));
+      }
+      fea.push_back(vec);
+    }
+    std::vector<std::string> week_data;
+    std::vector<std::string> time_data;
+
+    week_data = string_split(data_vec[2], ",");
+    std::vector<float> vec_w;
+    for (int i = 0; i < week_data.size(); i++) {
+      vec_w.push_back(atof(week_data[i].c_str()));
+    }
+    week_fea.push_back(vec_w);
+
+    time_data = string_split(data_vec[1], ",");
+    std::vector<float> vec_t;
+    for (int i = 0; i < time_data.size(); i++) {
+      vec_t.push_back(atof(time_data[i].c_str()));
+    }
+    time_fea.push_back(vec_t);
+
+    cum += seq.size();
+    seq_offset.push_back(cum);
+
+    seq_num++;
+    if (seq_num >= _batch_size) {
+      break;
+    }
+  }
+}
+
+namespace paddle {
+
+AnakinConfig GetConfig() {
+  AnakinConfig config;
+  // using AnakinConfig::X86 if you need to use cpu to do inference
+  config.target_type = AnakinConfig::X86;
+  config.model_file = FLAGS_model;
+  config.device = 0;
+  config.max_batch_size = 1000;  // the max number of token
+  return config;
+}
+
+void set_tensor(std::string name, std::vector<int> shape,
+                std::vector<PaddleTensor>& vec) {  // NOLINT
+  int sum = 1;
+  std::for_each(shape.begin(), shape.end(), [&](int n) { sum *= n; });
+  float* data = new float[sum];
+  PaddleTensor tensor;
+  tensor.name = name;
+  tensor.shape = shape;
+  tensor.data = PaddleBuf(data, sum);
+  tensor.dtype = PaddleDType::FLOAT32;
+  vec.push_back(tensor);
+}
+
+void single_test() {
+  AnakinConfig config = GetConfig();
+  auto predictor =
+      CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
+
+  int max_batch_size = 1000;
+  std::string feature_file = FLAGS_datapath;
+  Data map_data(feature_file, FLAGS_batch_size);
+  std::vector<std::vector<float>> fea;
+  std::vector<std::vector<float>> week_fea;
+  std::vector<std::vector<float>> time_fea;
+  std::vector<long unsigned int> seq_offset;  // NOLINT
+
+  paddle::PaddleTensor tensor_0, tensor_1, tensor_2;
+  tensor_0.name = "input_0";
+  tensor_1.name = "input_4";
+  tensor_2.name = "input_5";
+
+  PaddleTensor tensor_out;
+  tensor_out.name = "final_output.tmp_1_gout";
+  tensor_out.shape = std::vector<int>({});
+  tensor_out.data = PaddleBuf();
+  tensor_out.dtype = PaddleDType::FLOAT32;
+
+  std::vector<PaddleTensor> inputs;
+  std::vector<PaddleTensor> outputs(1, tensor_out);
+
+  int data_0_dim = 38;
+  int data_1_dim = 10;
+  int data_2_dim = 10;
+  float data_0[max_batch_size * data_0_dim];  // NOLINT
+  float data_1[max_batch_size * data_1_dim];  // NOLINT
+  float data_2[max_batch_size * data_2_dim];  // NOLINT
+
+  int count = 0;
+  while (true) {
+    if (count++ > 0) break;  // only run the first batch in ci.
+    seq_offset.clear();
+    map_data.get_batch_data(fea, week_fea, time_fea, seq_offset);
+    if (seq_offset.size() <= 1) {
+      LOG(FATAL) << "seq_offset.size() <= 1, exit.";
+      break;
+    }
+
+    std::vector<std::vector<long unsigned int>> seq_offset_vec;  // NOLINT
+    seq_offset_vec.push_back(seq_offset);
+    tensor_0.lod = seq_offset_vec;
+
+    int p_shape_0[] = {(int)fea.size(), 1, 1, data_0_dim};       // NOLINT
+    int p_shape_1[] = {(int)week_fea.size(), data_1_dim, 1, 1};  // NOLINT
+    int p_shape_2[] = {(int)time_fea.size(), data_2_dim, 1, 1};  // NOLINT
+
+    std::vector<int> shape_0(p_shape_0, p_shape_0 + 4);
+    std::vector<int> shape_1(p_shape_1, p_shape_1 + 4);
+    std::vector<int> shape_2(p_shape_2, p_shape_2 + 4);
+
+    tensor_0.shape = shape_0;
+    tensor_1.shape = shape_1;
+    tensor_2.shape = shape_2;
+
+    for (int i = 0; i < fea.size(); i++) {
+      memcpy(data_0 + i * data_0_dim, &fea[i][0], sizeof(float) * data_0_dim);
+    }
+    for (int i = 0; i < week_fea.size(); i++) {
+      memcpy(data_1 + i * data_1_dim, &week_fea[i][0],
+             sizeof(float) * data_1_dim);
+    }
+    for (int i = 0; i < time_fea.size(); i++) {
+      memcpy(data_2 + i * data_2_dim, &time_fea[i][0],
+             sizeof(float) * data_2_dim);
+    }
+
+    tensor_0.data =
+        paddle::PaddleBuf(data_0, fea.size() * sizeof(float) * data_0_dim);
+    tensor_1.data =
+        paddle::PaddleBuf(data_1, week_fea.size() * sizeof(float) * data_1_dim);
+    tensor_2.data =
+        paddle::PaddleBuf(data_2, time_fea.size() * sizeof(float) * data_2_dim);
+
+    tensor_0.dtype = paddle::PaddleDType::FLOAT32;
+    tensor_1.dtype = paddle::PaddleDType::FLOAT32;
+    tensor_2.dtype = paddle::PaddleDType::FLOAT32;
+
+    inputs.clear();
+    inputs.push_back(tensor_1);
+    inputs.push_back(tensor_2);
+    inputs.push_back(tensor_0);
+
+    Timer timer;
+    timer.tic();
+    for (int i = 0; i < FLAGS_repeat; i++) predictor->Run(inputs, &outputs);
+
+    LOG(INFO) << "batch_size = " << FLAGS_batch_size
+              << ", repeat = " << FLAGS_repeat
+              << ", sequence_length = " << seq_offset[seq_offset.size() - 1]
+              << ", latency: " << timer.toc() / FLAGS_repeat << "ms";
+
+    float* data_o = static_cast<float*>(outputs[0].data.data());
+    VLOG(3) << "outputs[0].data.length() = " << outputs[0].data.length();
+    for (size_t j = 0; j < outputs[0].data.length(); ++j) {
+      VLOG(3) << "output[" << j << "]: " << data_o[j];
+    }
+  }
+}
+}  // namespace paddle
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  logger::init(argv[0]);
+
+  paddle::single_test();
+  /* multi-threads
+  std::vector<std::thread> threads;
+  int num = 1;
+  for (int i = 0; i < num; i++) {
+    LOG(INFO) << " thread id : " << i;
+    threads.emplace_back(paddle::single_test);
+  }
+  for (int i = 0; i < num; i++) {
+    threads[i].join();
+  }
+  threads.clear();
+  */
+
+  return 0;
+}
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index e31c637e969f7a86f4f185abb4f0f01d3303db75..32a691b81ffc0586a07f4f06d2114fa5da2e18e2 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -137,8 +137,11 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
     return false;
   }
   for (size_t i = 0; i < feed_target_names_.size(); ++i) {
-    VLOG(4) << "setting " << i << "-th target";
-    feed_targets[feed_target_names_[i]] = &feeds[i];
+    if (config_.specify_input_name) {
+      feed_targets[inputs[i].name] = &feeds[i];
+    } else {
+      feed_targets[feed_target_names_[i]] = &feeds[i];
+    }
   }
   // get fetch variable
   std::map<std::string, framework::LoDTensor *> fetch_targets;
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
index 45b5a7638b7dc6a54bbd905766fd5c284cb6aea1..9ac037297167fe7de29925ffe36f4d39efb65313 100644
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/operators/tensorrt_engine_op.h"
 
@@ -32,7 +33,8 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
 
   bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
     VLOG(3) << "Predictor::init()";
-
+    FLAGS_tensorrt_max_batch_size = config_.max_batch_size;
+    FLAGS_tensorrt_workspace_size = config_.workspace_size;
     if (config_.use_gpu) {
       place_ = paddle::platform::CUDAPlace(config_.device);
     } else {
@@ -150,3 +152,12 @@ CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
 }
 
 }  // namespace paddle
+
+USE_TRT_CONVERTER(elementwise_add_weight);
+USE_TRT_CONVERTER(mul);
+USE_TRT_CONVERTER(conv2d);
+USE_TRT_CONVERTER(relu);
+USE_TRT_CONVERTER(fc);
+USE_TRT_CONVERTER(pool2d);
+USE_TRT_CONVERTER(softmax);
+USE_TRT_CONVERTER(batch_norm);
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
index fcbf9b89d608e7961e3ef81ac1c70e083dae1cc0..8f1a72316d6c146ebc9a86ced739ef088a3b4267 100644
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
@@ -23,7 +23,7 @@ namespace paddle {
 DEFINE_string(dirname, "", "Directory of the inference model.");
 
 void CompareTensorRTWithFluid(bool enable_tensorrt) {
-  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = enable_tensorrt;
+  FLAGS_IA_enable_tensorrt_subgraph_engine = enable_tensorrt;
 
   //# 1. Create PaddlePredictor with a config.
   NativeConfig config0;
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 3e829dd726b132844a45427b7b0b39eedf197496..7824ef2649af81a2390ff3bc537eb7c93c70e402 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -13,16 +13,22 @@ else
   use_gpu_list='false'
 fi
 
+PREFIX=inference-vis-demos%2F
+URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
+
 # download vis_demo data
 function download() {
   dir_name=$1
   mkdir -p $dir_name
   cd $dir_name
-  wget -q ${URL_ROOT}$dir_name.tar.gz
-  tar xzf *.tar.gz
+  if [[ -e "${PREFIX}${dir_name}.tar.gz" ]]; then
+    echo "${PREFIX}{dir_name}.tar.gz has been downloaded."
+  else
+      wget -q ${URL_ROOT}$dir_name.tar.gz
+      tar xzf *.tar.gz
+  fi
   cd ..
 }
-URL_ROOT=http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F
 mkdir -p data
 cd data
 vis_demo_list='se_resnext50 ocr mobilenet'
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c166cc0622f68e6d527005795c21236ccf43c33
--- /dev/null
+++ b/paddle/fluid/inference/api/helper.h
@@ -0,0 +1,110 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <sys/time.h>
+#include <algorithm>
+#include <sstream>
+#include <string>
+#include <vector>
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+namespace paddle {
+namespace inference {
+
+// Timer for timer
+class Timer {
+ public:
+  double start;
+  double startu;
+  void tic() {
+    struct timeval tp;
+    gettimeofday(&tp, NULL);
+    start = tp.tv_sec;
+    startu = tp.tv_usec;
+  }
+  double toc() {
+    struct timeval tp;
+    gettimeofday(&tp, NULL);
+    double used_time_ms =
+        (tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
+    return used_time_ms;
+  }
+};
+
+void split(const std::string &str, char sep, std::vector<std::string> *pieces) {
+  pieces->clear();
+  if (str.empty()) {
+    return;
+  }
+  size_t pos = 0;
+  size_t next = str.find(sep, pos);
+  while (next != std::string::npos) {
+    pieces->push_back(str.substr(pos, next - pos));
+    pos = next + 1;
+    next = str.find(sep, pos);
+  }
+  if (!str.substr(pos).empty()) {
+    pieces->push_back(str.substr(pos));
+  }
+}
+void split_to_float(const std::string &str, char sep, std::vector<float> *fs) {
+  std::vector<std::string> pieces;
+  split(str, sep, &pieces);
+  std::transform(pieces.begin(), pieces.end(), std::back_inserter(*fs),
+                 [](const std::string &v) { return std::stof(v); });
+}
+template <typename T>
+std::string to_string(const std::vector<T> &vec) {
+  std::stringstream ss;
+  for (const auto &c : vec) {
+    ss << c << " ";
+  }
+  return ss.str();
+}
+template <>
+std::string to_string<std::vector<float>>(
+    const std::vector<std::vector<float>> &vec) {
+  std::stringstream ss;
+  for (const auto &piece : vec) {
+    ss << to_string(piece) << "\n";
+  }
+  return ss.str();
+}
+template <>
+std::string to_string<std::vector<std::vector<float>>>(
+    const std::vector<std::vector<std::vector<float>>> &vec) {
+  std::stringstream ss;
+  for (const auto &line : vec) {
+    for (const auto &rcd : line) {
+      ss << to_string(rcd) << ";\t";
+    }
+    ss << '\n';
+  }
+  return ss.str();
+}
+// clang-format off
+void TensorAssignData(PaddleTensor *tensor, const std::vector<std::vector<float>> &data) {
+  // Assign buffer
+  int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1, [](int a, int b) { return a * b; });
+  tensor->data.Resize(sizeof(float) * dim);
+  int c = 0;
+  for (const auto &f : data) {
+    for (float v : f) { static_cast<float *>(tensor->data.data())[c++] = v; }
+  }
+}
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/high_level_api_cn.md b/paddle/fluid/inference/api/high_level_api_cn.md
index 2fb914592cbcb1b0c3f2ef33ff9cf4c295e427b6..442c598978c700f4c438b365b8900db5b65bc5ec 100644
--- a/paddle/fluid/inference/api/high_level_api_cn.md
+++ b/paddle/fluid/inference/api/high_level_api_cn.md
@@ -65,13 +65,13 @@ config.model_dir = "xxx";
 config.use_gpu = false;
 // 创建一个原生的 PaddlePredictor
 auto predictor =
-      paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+      paddle::CreatePaddlePredictor<paddle::NativeConfig, paddle::PaddleEngineKind::kNative>(config);
 // 创建输入 tensor
 int64_t data[4] = {1, 2, 3, 4};
 paddle::PaddleTensor tensor{.name = "",
                             .shape = std::vector<int>({4, 1}),
-                            .data = PaddleBuf(data, sizeof(data)),
-                            .dtype = PaddleDType::INT64};
+                            .data = paddle::PaddleBuf(data, sizeof(data)),
+                            .dtype = paddle::PaddleDType::INT64};
 // 创建输出 tensor，输出 tensor 的内存可以复用
 std::vector<paddle::PaddleTensor> outputs;
 // 执行预测
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 794534467be066e91db2b4c204913ab2cf12dbfd..36fd0727aa7beef4a06a5f2e63ec0c43583ddf84 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -45,7 +45,7 @@ class PaddleBuf {
   PaddleBuf(void* data, size_t length)
       : data_(data), length_(length), memory_owned_{false} {}
   // Own memory.
-  explicit PaddleBuf(size_t length)
+  PaddleBuf(size_t length)
       : data_(new char[length]), length_(length), memory_owned_(true) {}
   // Resize to `length` bytes.
   void Resize(size_t length);
@@ -70,7 +70,7 @@ struct PaddleTensor {
   std::vector<int> shape;
   PaddleBuf data;  // blob of data.
   PaddleDType dtype;
-  std::vector<std::vector<uint64_t>> lod;  // lod data
+  std::vector<std::vector<size_t>> lod;  // Tensor+LoD equals LoDTensor
 };
 
 enum class PaddleEngineKind {
@@ -120,6 +120,8 @@ struct NativeConfig : public PaddlePredictor::Config {
   bool use_gpu{false};
   int device{0};
   float fraction_of_gpu_memory{-1.f};  // Negative to notify initialization.
+  // Specify the variable's name of each input.
+  bool specify_input_name{false};
 
   std::string prog_file;
   std::string param_file;
@@ -137,6 +139,14 @@ struct AnakinConfig : public PaddlePredictor::Config {
 struct TensorRTConfig : public NativeConfig {
   // Determine whether a subgraph will be executed by TRT.
   int min_subgraph_size{1};
+  // While TensorRT allows an engine optimized for a given max batch size
+  // to run at any smaller size, the performance for those smaller
+  // sizes may not be as well-optimized. Therefore, Max batch is best
+  // equivalent to the runtime batch size.
+  int max_batch_size{1};
+  // For workspace_size, refer it from here:
+  // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
+  int workspace_size{1 << 30};
 };
 
 // A factory to help create different predictors.
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
index b52d083f280e5e7713600a7b748dedd37aca0a1e..a610687a5b11999a7cb7426dbe961e5972ee1746 100644
--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -1,4 +1,4 @@
-nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto)
+nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
 nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
 add_subdirectory(convert)
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 6863b035d8cd9dfb21aed3947226a796778912a4..2a449eb95e3537a11962912a6a3f29e89958fbd8 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,7 +1,7 @@
 # Add TRT tests
 nv_library(tensorrt_converter
   SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
-activation_op.cc softmax_op.cc
+batch_norm_op.cc activation_op.cc softmax_op.cc 
   DEPS tensorrt_engine operator scope framework_proto op_registry)
 
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
@@ -24,3 +24,6 @@ nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
 
 nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
+
+nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL)
diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..94f8b0ae5606d39a722ffe28501645c9b6fc5d2e
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
@@ -0,0 +1,136 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <math.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class BatchNormOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    LOG(INFO) << "convert a fluid batch norm op to tensorrt batch_norm";
+
+    framework::OpDesc op_desc(op, nullptr);
+    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1);   // Bias is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Input("Mean").size(), 1);   // Mean is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Input("Scale").size(), 1);  // Scale is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Input("Variance").size(),
+                      1);  // Variance is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1);
+
+    auto* X = engine_->GetITensor(op_desc.Input("X").front());
+    // Declare weights
+    auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front());
+    auto* Mean_v = scope.FindVar(op_desc.Input("Mean").front());
+    auto* Scale_v = scope.FindVar(op_desc.Input("Scale").front());
+    auto* Variance_v = scope.FindVar(op_desc.Input("Variance").front());
+    const float eps = boost::get<float>(op_desc.GetAttr("epsilon"));
+
+    PADDLE_ENFORCE_NOT_NULL(Bias_v);
+    PADDLE_ENFORCE_NOT_NULL(Mean_v);
+    PADDLE_ENFORCE_NOT_NULL(Scale_v);
+    PADDLE_ENFORCE_NOT_NULL(Variance_v);
+
+    // get tensor
+    auto* Bias_t = Bias_v->GetMutable<framework::LoDTensor>();
+    auto* Mean_t = Mean_v->GetMutable<framework::LoDTensor>();
+    auto* Scale_t = Scale_v->GetMutable<framework::LoDTensor>();
+    auto* Variance_t = Variance_v->GetMutable<framework::LoDTensor>();
+
+    // create temp tensor for weights
+    framework::LoDTensor bias_tensor;
+    framework::LoDTensor mean_tensor;
+    framework::LoDTensor scale_tensor;
+    framework::LoDTensor variance_tensor;
+
+    bias_tensor.Resize(Bias_t->dims());
+    mean_tensor.Resize(Mean_t->dims());
+    scale_tensor.Resize(Scale_t->dims());
+    variance_tensor.Resize(Variance_t->dims());
+
+    platform::CPUPlace cpu_place;
+    // copy data from gpu to cpu
+    TensorCopySync((*Bias_t), cpu_place, &bias_tensor);
+    TensorCopySync((*Mean_t), cpu_place, &mean_tensor);
+    TensorCopySync((*Scale_t), cpu_place, &scale_tensor);
+    TensorCopySync((*Variance_t), cpu_place, &variance_tensor);
+
+    auto* bias_data = bias_tensor.mutable_data<float>(platform::CPUPlace());
+    auto* mean_data = mean_tensor.mutable_data<float>(platform::CPUPlace());
+    auto* scale_data = scale_tensor.mutable_data<float>(platform::CPUPlace());
+    auto* variance_data =
+        variance_tensor.mutable_data<float>(platform::CPUPlace());
+
+    std::unique_ptr<framework::LoDTensor> combile_scale_tensor(
+        new framework::LoDTensor());
+    std::unique_ptr<framework::LoDTensor> combile_bias_tensor(
+        new framework::LoDTensor());
+
+    combile_scale_tensor->Resize(scale_tensor.dims());
+    combile_bias_tensor->Resize(bias_tensor.dims());
+
+    auto* combile_scale_data =
+        combile_scale_tensor->mutable_data<float>(platform::CPUPlace());
+    auto* combile_bias_data =
+        combile_bias_tensor->mutable_data<float>(platform::CPUPlace());
+
+    size_t ele_num = combile_scale_tensor->memory_size() / sizeof(float);
+
+    for (size_t i = 0; i < ele_num; i++) {
+      float scale = scale_data[i];
+      float bias = bias_data[i];
+      float mean = mean_data[i];
+      float variance = variance_data[i];
+      combile_scale_data[i] = scale / sqrtf(variance + eps);
+      combile_bias_data[i] = bias - mean * combile_scale_data[i];
+    }
+
+    TensorRTEngine::Weight scale_weights{
+        nvinfer1::DataType::kFLOAT, static_cast<void*>(combile_scale_data),
+        combile_scale_tensor->memory_size() / sizeof(float)};
+    TensorRTEngine::Weight shift_weights{
+        nvinfer1::DataType::kFLOAT, static_cast<void*>(combile_bias_data),
+        combile_bias_tensor->memory_size() / sizeof(float)};
+    TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
+                                         0};
+
+    nvinfer1::IScaleLayer* layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Scale, *const_cast<nvinfer1::ITensor*>(X),
+                             nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(),
+                             scale_weights.get(), power_weights.get());
+
+    auto output_name = op_desc.Output("Y").front();
+    engine_->weight_map[op_desc.Input("Bias").front()] =
+        std::move(combile_bias_tensor);
+    engine_->weight_map[op_desc.Input("Scale").front()] =
+        std::move(combile_scale_tensor);
+
+    engine_->SetITensor(output_name, layer->getOutput(0));
+
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(batch_norm, BatchNormOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index dba1d50b2d1c487ced8e6ca51f2d257641ad5fc7..841a95db38ce7cf0cb5961ff04cb569ee2633e6f 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -35,12 +35,20 @@ class Conv2dOpConverter : public OpConverter {
     auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
     PADDLE_ENFORCE_NOT_NULL(Y_v);
     auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
-    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
 
-    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL);
-    const int n_output = Y_t->dims()[0];
-    const int filter_h = Y_t->dims()[2];
-    const int filter_w = Y_t->dims()[3];
+    platform::CPUPlace cpu_place;
+    std::unique_ptr<framework::LoDTensor> weight_tensor(
+        new framework::LoDTensor());
+    weight_tensor->Resize(Y_t->dims());
+    TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
+
+    auto* weight_data =
+        weight_tensor->mutable_data<float>(platform::CPUPlace());
+
+    PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
+    const int n_output = weight_tensor->dims()[0];
+    const int filter_h = weight_tensor->dims()[2];
+    const int filter_w = weight_tensor->dims()[3];
 
     const int groups = boost::get<int>(op_desc.GetAttr("groups"));
     const std::vector<int> dilations =
@@ -57,7 +65,7 @@ class Conv2dOpConverter : public OpConverter {
 
     TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                   static_cast<void*>(weight_data),
-                                  Y_t->memory_size() / sizeof(float)};
+                                  weight_tensor->memory_size() / sizeof(float)};
 
     TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
     auto* layer = TRT_ENGINE_ADD_LAYER(
@@ -70,6 +78,8 @@ class Conv2dOpConverter : public OpConverter {
     layer->setNbGroups(groups);
 
     auto output_name = op_desc.Output("Output").front();
+    engine_->weight_map[op_desc.Input("Filter").front()] =
+        std::move(weight_tensor);
     engine_->SetITensor(output_name, layer->getOutput(0));
     if (test_mode) {
       engine_->DeclareOutput(output_name);
diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
index 3744550f60a1696aedd8a3ecd24f1b21d22325b9..60a72b4eb5c75b5cd12305f13763a9a1a567213f 100644
--- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
 namespace paddle {
@@ -40,10 +39,17 @@ class ElementwiseWeightOpConverter : public OpConverter {
     auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
     PADDLE_ENFORCE_NOT_NULL(Y_v);
     auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
-    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
+
+    platform::CPUPlace cpu_place;
+    std::unique_ptr<framework::LoDTensor> weight_tensor(
+        new framework::LoDTensor());
+    weight_tensor->Resize(Y_t->dims());
+    TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
+    auto* weight_data =
+        weight_tensor->mutable_data<float>(platform::CPUPlace());
     auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
 
-    std::vector<int> dims_y = framework::vectorize2int(Y_t->dims());
+    std::vector<int> dims_y = framework::vectorize2int(weight_tensor->dims());
     if (static_cast<int>(dims_y.size()) == dims_x.nbDims + 1) {
       if (dims_y[0] == 1) dims_y.erase(dims_y.begin());
     }
@@ -70,9 +76,9 @@ class ElementwiseWeightOpConverter : public OpConverter {
       PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!");
     }
 
-    TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT,
-                                         static_cast<void*>(weight_data),
-                                         Y_t->memory_size() / sizeof(float)};
+    TensorRTEngine::Weight shift_weights{
+        nvinfer1::DataType::kFLOAT, static_cast<void*>(weight_data),
+        weight_tensor->memory_size() / sizeof(float)};
     TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr,
                                          0};
     TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
@@ -82,6 +88,8 @@ class ElementwiseWeightOpConverter : public OpConverter {
         engine_, Scale, *const_cast<nvinfer1::ITensor*>(X), scale_mode,
         shift_weights.get(), scale_weights.get(), power_weights.get());
     auto output_name = op_desc.Output("Out")[0];
+
+    engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor);
     engine_->SetITensor(output_name, layer->getOutput(0));
     if (test_mode) {  // the test framework can not determine which is the
                       // output, so place the declaration inside.
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index 39fe1f609d7b94638506877fc301f19ef33ec8ac..ad98d85aae9cf594922aca00c43718ccfbce2278 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -12,12 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/engine.h"
-#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace inference {
@@ -73,19 +68,26 @@ class FcOpConverter : public OpConverter {
     auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
     // This may trigger a GPU->CPU copy, because TRT's weight can only be
     // assigned from CPU memory, that can't be avoided.
-    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
-    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL);  // a matrix
-    size_t n_output = Y_t->dims()[1];
+    platform::CPUPlace cpu_place;
+    framework::LoDTensor weight_tensor;
+    weight_tensor.Resize(Y_t->dims());
+    TensorCopySync((*Y_t), cpu_place, &weight_tensor);
 
-    framework::LoDTensor tmp;
-    tmp.Resize(Y_t->dims());
-    memcpy(tmp.mutable_data<float>(platform::CPUPlace()), weight_data,
+    auto* weight_data = weight_tensor.mutable_data<float>(platform::CPUPlace());
+
+    PADDLE_ENFORCE_EQ(weight_tensor.dims().size(), 2UL);  // a matrix
+    size_t n_output = weight_tensor.dims()[1];
+
+    std::unique_ptr<framework::Tensor> tmp(new framework::LoDTensor());
+    tmp->Resize(weight_tensor.dims());
+
+    memcpy(tmp->mutable_data<float>(platform::CPUPlace()), weight_data,
            Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
     TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                   static_cast<void*>(weight_data),
                                   Y_t->memory_size() / sizeof(float)};
     TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
-                                      static_cast<void*>(tmp.data<float>()),
+                                      static_cast<void*>(tmp->data<float>()),
                                       Y_t->memory_size() / sizeof(float));
     weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
     tmp_weight.dims = weight.dims;
@@ -106,6 +108,7 @@ class FcOpConverter : public OpConverter {
 
     auto output_name = op_desc.Output("Out").front();
     engine_->SetITensor(output_name, layer->getOutput(0));
+    engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp);
     if (test_mode) {
       engine_->DeclareOutput(output_name);
     }
diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
index 11cad95361867476c6f775af778015da37f1cfb1..73f1b28ddf73403862e55d102a259d7b6cf67b1f 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -33,6 +33,7 @@ class Pool2dOpConverter : public OpConverter {
     PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
     auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
 
+    bool global_pooling = boost::get<bool>(op_desc.GetAttr("global_pooling"));
     std::string pool_type =
         boost::get<std::string>(op_desc.GetAttr("pooling_type"));
     std::vector<int> ksize =
@@ -42,7 +43,13 @@ class Pool2dOpConverter : public OpConverter {
     std::vector<int> paddings =
         boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
 
-    const nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]);
+    nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]);
+    if (global_pooling == true) {
+      nvinfer1::Dims input_shape = input1->getDimensions();
+      int nbDims = input_shape.nbDims;
+      nv_ksize.d[0] = input_shape.d[nbDims - 2];
+      nv_ksize.d[1] = input_shape.d[nbDims - 1];
+    }
     const nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
     const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..41412cb079540da72760558379b158b6538aa6a8
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc
@@ -0,0 +1,71 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(batch_norm_op, test) {
+  std::unordered_set<std::string> parameters(
+      {"batch_norm_scale", "batch_norm_bias", "batch_norm_mean",
+       "batch_norm_variance"});
+  framework::Scope scope;
+  TRTConvertValidation validator(5, parameters, scope, 1 << 15);
+  std::vector<int> param_shape{2};
+
+  validator.DeclInputVar("batch_norm_X", nvinfer1::DimsCHW(2, 5, 5));
+  validator.DeclParamVar("batch_norm_scale", param_shape);
+  validator.DeclParamVar("batch_norm_bias", param_shape);
+  validator.DeclParamVar("batch_norm_mean", param_shape);
+  validator.DeclParamVar("batch_norm_variance", param_shape);
+  validator.DeclOutputVar("batch_norm_Y", nvinfer1::DimsCHW(2, 5, 5));
+  validator.DeclOutputVar("batch_norm_save_mean", param_shape);
+  validator.DeclOutputVar("batch_norm_save_variance", param_shape);
+
+  // Prepare Op description
+  framework::OpDesc desc;
+
+  desc.SetType("batch_norm");
+  desc.SetInput("X", {"batch_norm_X"});
+  desc.SetInput("Scale", {"batch_norm_scale"});
+  desc.SetInput("Bias", {"batch_norm_bias"});
+  desc.SetInput("Mean", {"batch_norm_mean"});
+  desc.SetInput("Variance", {"batch_norm_variance"});
+  desc.SetOutput("Y", {"batch_norm_Y"});
+  desc.SetOutput("MeanOut", {"batch_norm_mean"});
+  desc.SetOutput("VarianceOut", {"batch_norm_variance"});
+  desc.SetOutput("SavedMean", {"batch_norm_save_mean"});
+  desc.SetOutput("SavedVariance", {"batch_norm_save_variance"});
+
+  float eps = 1e-5f;
+  bool is_test = true;
+  desc.SetAttr("epsilon", eps);
+  desc.SetAttr("is_test", is_test);
+
+  validator.SetOp(*desc.Proto());
+
+  std::unordered_set<std::string> neglected_output = {
+      "batch_norm_save_mean", "batch_norm_save_variance", "batch_norm_mean",
+      "batch_norm_variance"};
+  validator.Execute(3, neglected_output);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+USE_OP(batch_norm);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index d6651a5b244ba31a01220e6299cb2016ae61fe64..01d7f700da9cc67d0ebbd3d9649e3823f58a8811 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -57,6 +57,7 @@ TEST(OpConverter, ConvertBlock) {
   auto* x = scope.Var("conv2d-Y");
   auto* x_tensor = x->GetMutable<framework::LoDTensor>();
   x_tensor->Resize(framework::make_ddim(dim_vec));
+  x_tensor->mutable_data<float>(platform::CUDAPlace(0));
 
   OpConverter converter;
   converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope,
diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
index c5dddbc8cd37b9fb1ba39382af2da5ad045f3af2..aedd6b62df040eeee4e48f628128511cd8bf4439 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
@@ -20,7 +20,7 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-TEST(Pool2dOpConverter, main) {
+void test_pool2d(bool global_pooling) {
   framework::Scope scope;
   std::unordered_set<std::string> parameters;
   TRTConvertValidation validator(5, parameters, scope, 1 << 15);
@@ -28,7 +28,10 @@ TEST(Pool2dOpConverter, main) {
   // The ITensor's Dims should not contain the batch size.
   // So, the ITensor's Dims of input and output should be C * H * W.
   validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4));
-  validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2));
+  if (global_pooling)
+    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 1, 1));
+  else
+    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2));
 
   // Prepare Op description
   framework::OpDesc desc;
@@ -45,6 +48,7 @@ TEST(Pool2dOpConverter, main) {
   desc.SetAttr("ksize", ksize);
   desc.SetAttr("strides", strides);
   desc.SetAttr("paddings", paddings);
+  desc.SetAttr("global_pooling", global_pooling);
 
   LOG(INFO) << "set OP";
   validator.SetOp(*desc.Proto());
@@ -53,6 +57,10 @@ TEST(Pool2dOpConverter, main) {
   validator.Execute(3);
 }
 
+TEST(Pool2dOpConverter, normal) { test_pool2d(false); }
+
+TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true); }
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index 4265f33f28fe36b1745baf4761c3c85e3a281d6b..0a6f171fc40a838fd81d6a51aca0430d5526f188 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -24,6 +24,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
@@ -48,11 +49,17 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
   auto dims = tensor->dims();
   size_t num_elements = analysis::AccuDims(dims, dims.size());
   PADDLE_ENFORCE_GT(num_elements, 0);
-  auto* data = tensor->mutable_data<float>(place);
+
+  platform::CPUPlace cpu_place;
+  framework::LoDTensor temp_tensor;
+  temp_tensor.Resize(dims);
+  auto* temp_data = temp_tensor.mutable_data<float>(cpu_place);
 
   for (size_t i = 0; i < num_elements; i++) {
-    *(data + i) = random(0., 1.);
+    *(temp_data + i) = random(0., 1.);
   }
+
+  TensorCopySync(temp_tensor, place, tensor);
 }
 
 /*
@@ -91,18 +98,26 @@ class TRTConvertValidation {
     engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims);
   }
 
+  void DeclParamVar(const std::string& name, const std::vector<int> dim_vec) {
+    DeclVar(name, dim_vec);
+  }
+
   // Declare a parameter varaible in the scope.
   void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) {
     DeclVar(name, dims, true);
   }
 
+  void DeclOutputVar(const std::string& name, const std::vector<int> dim_vec) {
+    DeclVar(name, dim_vec);
+  }
+
   void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) {
     DeclVar(name, dims);
   }
 
   void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
-    platform::CPUPlace place;
-    platform::CPUDeviceContext ctx(place);
+    platform::CUDAPlace place;
+    platform::CUDADeviceContext ctx(place);
 
     auto* x = scope_.Var(name);
     auto* x_tensor = x->GetMutable<framework::LoDTensor>();
@@ -141,18 +156,22 @@ class TRTConvertValidation {
       PADDLE_ENFORCE(var);
       auto tensor = var->GetMutable<framework::LoDTensor>();
 
-      engine_->SetInputFromCPU(
+      engine_->SetInputFromGPU(
           input, static_cast<void*>(tensor->data<void>()),
           sizeof(float) *
               analysis::AccuDims(tensor->dims(), tensor->dims().size()));
     }
   }
 
-  void Execute(int batch_size) {
+  // We use the set 'neglected_output' here, because some Ops like batch norm,
+  // the outputs specified in the op des are only used during training,
+  // so we should neglect those output during inference.
+  void Execute(int batch_size,
+               std::unordered_set<std::string> neglected_output = {}) {
     // Execute Fluid Op
     PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
-    platform::CPUPlace place;
-    platform::CPUDeviceContext ctx(place);
+    platform::CUDAPlace place;
+    platform::CUDADeviceContext ctx(place);
     op_->Run(scope_, place);
     // Execute TRT.
     engine_->Execute(batch_size);
@@ -161,6 +180,7 @@ class TRTConvertValidation {
     ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
     const size_t output_space_size = 3000;
     for (const auto& output : op_desc_->OutputArgumentNames()) {
+      if (neglected_output.count(output)) continue;
       std::vector<float> fluid_out;
       std::vector<float> trt_out(output_space_size);
       engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index b821c3d0bf425c46fae634fbf53f7ee63100ca5c..14e9e14d33d637ee68e37593cc48721e5169499f 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -33,6 +33,7 @@ void TensorRTEngine::Build(const DescType &paddle_model) {
 }
 
 void TensorRTEngine::Execute(int batch_size) {
+  freshDeviceId();
   batch_size_ = batch_size;
   std::vector<void *> buffers;
   for (auto &buf : buffers_) {
@@ -60,6 +61,7 @@ TensorRTEngine::~TensorRTEngine() {
 }
 
 void TensorRTEngine::FreezeNetwork() {
+  freshDeviceId();
   PADDLE_ENFORCE(infer_builder_ != nullptr,
                  "Call InitNetwork first to initialize network.");
   PADDLE_ENFORCE(infer_network_ != nullptr,
@@ -241,6 +243,13 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
 
 int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
 
+void TensorRTEngine::freshDeviceId() {
+  int count;
+  cudaGetDeviceCount(&count);
+  PADDLE_ENFORCE_LT(device_, count);
+  cudaSetDevice(device_);
+}
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 694468c419c20089de1cdecff1a903ad0cc6e99f..bd3ba4cea6551a7f6651e311e2649de191a6faa1 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/utils/singleton.h"
@@ -52,13 +53,15 @@ class TensorRTEngine : public EngineBase {
   };
 
   TensorRTEngine(int max_batch, int max_workspace,
-                 cudaStream_t* stream = nullptr,
+                 cudaStream_t* stream = nullptr, int device = 0,
                  nvinfer1::ILogger& logger = NaiveLogger::Global())
       : max_batch_(max_batch),
         max_workspace_(max_workspace),
         stream_(stream ? stream : &default_stream_),
-        logger_(logger) {
-    cudaStreamCreate(&default_stream_);
+        logger_(logger),
+        device_(device) {
+    freshDeviceId();
+    cudaStreamCreate(stream_);
   }
 
   virtual ~TensorRTEngine();
@@ -119,6 +122,15 @@ class TensorRTEngine : public EngineBase {
   nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
   void SetRuntimeBatch(size_t batch_size);
   int GetRuntimeBatch();
+  int GetDevice() { return device_; }
+
+  // A pointer to CPU memory is needed of the TRT weight.
+  // Before TRT runs, fluid loads weight into GPU storage.
+  // so we need to copy the weights from GPU to CPU in our op converter.
+  // We use a map to store these weights for the weight memory is not released
+  // in advance, which affecting the construction of TRT Op.
+  std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
+      weight_map;
 
  private:
   // the max batch size
@@ -140,6 +152,8 @@ class TensorRTEngine : public EngineBase {
   std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
   std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
       itensor_map_;
+  // The specific GPU id that the TensorRTEngine bounded to.
+  int device_;
 
   // TensorRT related internal members
   template <typename T>
@@ -156,6 +170,10 @@ class TensorRTEngine : public EngineBase {
   infer_ptr<nvinfer1::INetworkDefinition> infer_network_;
   infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
   infer_ptr<nvinfer1::IExecutionContext> infer_context_;
+  // Each ICudaEngine object is bound to a specific GPU when it is instantiated,
+  // ensure that the thread is associated with the correct device by calling
+  // freshDeviceId().
+  void freshDeviceId();
 };  // class TensorRTEngine
 
 // Add an layer__ into engine__ with args ARGS.
@@ -188,8 +206,8 @@ class TRT_EngineManager {
 
   // Create or get an engine called `name`
   TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream,
-                         const std::string& name) {
-    auto* p = new TensorRTEngine(max_batch, max_workspace, stream);
+                         const std::string& name, int gpu_device = 0) {
+    auto* p = new TensorRTEngine(max_batch, max_workspace, stream, gpu_device);
     engines_[name].reset(p);
     return p;
   }
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index dc03702990587bf5e65d28da662d10df4d882110..da1f6535cb3b2476cd475797861d6d2bb6d88856 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -27,7 +27,7 @@ namespace tensorrt {
 class TensorRTEngineTest : public ::testing::Test {
  protected:
   void SetUp() override {
-    ASSERT_EQ(0, cudaStreamCreate(&stream_));
+    // ASSERT_EQ(0, cudaStreamCreate(&stream_));
     engine_ = new TensorRTEngine(10, 1 << 10, &stream_);
     engine_->InitNetwork();
   }
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index e8b5dec9d49f5613cec92441d19ab7dc1a1ad90c..68fbde2c09fd9a9e84fd7f1202fe474beb0e81b9 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -9,7 +9,6 @@ function(op_library TARGET)
     # op_library is a function to create op library. The interface is same as
     # cc_library. But it handle split GPU/CPU code and link some common library
     # for ops.
-    set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
     set(cc_srcs)
     set(cu_srcs)
     set(hip_cu_srcs)
@@ -84,6 +83,16 @@ function(op_library TARGET)
         message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
     endif()
 
+    #remove windows unsupported op
+    if (WIN32)
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op")
+        if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
+          return()
+        endif()
+    endforeach()
+    endif(WIN32)
+    set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
+
     list(LENGTH op_library_DEPS op_library_DEPS_len)
     if (${op_library_DEPS_len} GREATER 0)
         set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
@@ -100,7 +109,8 @@ function(op_library TARGET)
     endif()
 
     # Define operators that don't need pybind here.
-    foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
+    foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
+"tensor_array_read_write_op" "tensorrt_engine_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()
@@ -180,19 +190,19 @@ function(op_library TARGET)
 endfunction()
 
 add_subdirectory(math)
+if (NOT WIN32)
 add_subdirectory(nccl)
-
 if(WITH_GPU)
     op_library(nccl_op DEPS nccl_common)
     file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
 else()
     set(DEPS_OPS ${DEPS_OPS} nccl_op)
 endif()
+endif() # NOT WIN32
 
 set(DISTRIBUTE_DEPS "")
 if(WITH_DISTRIBUTE)
     add_subdirectory(distributed)
-    
     set(DISTRIBUTE_DEPS "")
     if(WITH_GRPC)
         set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
@@ -221,7 +231,7 @@ if(WITH_DISTRIBUTE)
     #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
     #        listen_and_serv_op sum_op executor SERIAL)
-    if(WITH_GPU)
+    if(WITH_GPU AND NOT WIN32)
         set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
         cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op ${DISTRIBUTE_DEPS} executor SERIAL)
         if(WITH_GRPC)
@@ -232,7 +242,7 @@ if(WITH_DISTRIBUTE)
         set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     else()
         set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op)
-    endif()
+    endif() # WITH_GPU AND NOT WIN32
 else()
     set(DEPS_OPS ${DEPS_OPS}  checkpoint_notify_op prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op)
 endif()
@@ -248,6 +258,7 @@ op_library(softmax_op DEPS softmax)
 op_library(sequence_softmax_op DEPS softmax)
 if (WITH_GPU AND TENSORRT_FOUND)
     op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
+    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n")
     nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
       DEPS tensorrt_engine_op
       analysis)
@@ -329,5 +340,7 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
+if(NOT WIN32)
 nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
+endif()
 nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index d3a7ceed466a9b5e4d773f1531d198adff97eac2..27487b396ccf63d962defa6b270063ccb409164e 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -26,8 +26,6 @@ namespace plat = paddle::platform;
       act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,   \
                                                  ops::grad_functor<float>>, \
       ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
-                                ops::grad_functor<double>>,                 \
-      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
-                                ops::grad_functor<plat::float16>>);
+                                ops::grad_functor<double>>);
 
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 48f3b5a5bc06fbc211895a1a6d1521cfd97e0086..912415192659dc004f54a76e9cd1a20581d512a6 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -333,7 +333,8 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = static_cast<T>(0.5) * dout / out;
+    const Out out_conj = Eigen::numext::conj(out);
+    dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
   }
 };
 
@@ -739,7 +740,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * static_cast<T>(factor) *
-                   x.pow(static_cast<T>(factor) - static_cast<T>(1));
+                   x.pow(static_cast<T>(factor - static_cast<T>(1)));
   }
 };
 
@@ -862,11 +863,10 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    T b = static_cast<T>(beta);
     auto temp1 = static_cast<T>(1) /
-                 (static_cast<T>(1) + (static_cast<T>(-b) * x).exp());
-    auto temp2 = temp1 * (static_cast<T>(1) - (b * out));
-    dx.device(d) = dout * ((b * out) + temp2);
+                 (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
+    auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
+    dx.device(d) = dout * ((beta * out) + temp2);
   }
 };
 
diff --git a/paddle/fluid/operators/assign_value_op.cu.cc b/paddle/fluid/operators/assign_value_op.cu.cc
index 0ff174b3884df63d54d6486b017cc1a15ab23103..08bfde5dc92de9c675e5b9b85f8e65a3bab8631c 100644
--- a/paddle/fluid/operators/assign_value_op.cu.cc
+++ b/paddle/fluid/operators/assign_value_op.cu.cc
@@ -13,10 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/assign_value_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
-                        ops::AssignValueKernel<float>,
-                        ops::AssignValueKernel<plat::float16>);
+                        ops::AssignValueKernel<float>);
diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc
index 580fde753816c30b188b8a99cc63fcbafde64e25..135254ce6b6bf9add7bb1f0c3f645ed47081fba4 100644
--- a/paddle/fluid/operators/conditional_block_op.cc
+++ b/paddle/fluid/operators/conditional_block_op.cc
@@ -29,9 +29,9 @@ class ConditionalOp : public framework::OperatorBase {
 
  protected:
   std::vector<const framework::LoDTensor *> InputTensors(
-      const framework::Scope &scope) const {
+      const framework::Scope &scope, const std::string &in_name) const {
     std::vector<const framework::LoDTensor *> retv;
-    auto xs = Inputs("X");
+    auto xs = Inputs(in_name);
     retv.resize(xs.size(), nullptr);
     std::transform(
         xs.begin(), xs.end(), retv.begin(),
@@ -81,12 +81,18 @@ class ConditionalBlockOp : public ConditionalOp {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &dev_place) const override {
-    auto xs = InputTensors(scope);
-
     bool need_run;
     if (Attr<bool>("is_scalar_condition")) {
+      // When is_scalar_condition is True, the conditional variable is a scalar,
+      // whether need to execute the operators in sub-block depends on the
+      // conditional variable (Cond).
+      auto xs = InputTensors(scope, "Cond");
       need_run = ScalarCondition(xs);
     } else {
+      // When is_scalar_condition is False, the conditional variable maybe a
+      // vector or tensor, whether need to execute the operators in sub-block
+      // depends on the input variables (Input).
+      auto xs = InputTensors(scope, "Input");
       need_run = std::all_of(
           xs.begin(), xs.end(),
           [](const framework::LoDTensor *t) { return t->numel() != 0; });
@@ -110,11 +116,11 @@ class ConditionalBlockOp : public ConditionalOp {
 class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X",
-             "The conditional variable of this operator. If X is empty, the "
+    AddInput("Cond",
+             "The conditional variable of this operator. If Cond is empty, the "
              "whole sub-block will not be executed.")
         .AsDuplicable();
-    AddInput("Params", "The input variables of the sub-block.").AsDuplicable();
+    AddInput("Input", "The input variables of the sub-block.").AsDuplicable();
     AddOutput("Out", "The output variables of the sub-block.").AsDuplicable();
     AddOutput("Scope",
               "(std::vector<Scope*>) The step scope of conditional block. To "
@@ -123,13 +129,18 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<framework::BlockDesc *>(
         "sub_block", "The step block of conditional block operator");
     AddAttr<bool>("is_scalar_condition",
-                  "the input X is used as scalar "
-                  "condition")
+                  "The conditional variable (Cond) is used as scalar "
+                  "condition.")
         .SetDefault(false);
     AddComment(R"DOC(Conditional block operator
 
-Run the sub-block if X is not empty. Params is the other inputs and Out is the
-outputs of the sub-block.
+If `is_scalar_condition` is True, the conditional variable (Cond) is a scalar,
+run the operators in sub-block if Cond is True.
+
+If `is_scalar_condition` is False, the conditional variable (Cond) is a vector or
+tensor, run the operators in sub-block if all of input variables are not empty.
+
+
 )DOC");
   }
 };
@@ -145,12 +156,12 @@ class ConditionalBlockGradOp : public ConditionalOp {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &dev_place) const override {
-    auto xs = this->InputTensors(scope);
-
     bool need_run;
     if (Attr<bool>("is_scalar_condition")) {
+      auto xs = this->InputTensors(scope, "Cond");
       need_run = ScalarCondition(xs);
     } else {
+      auto xs = this->InputTensors(scope, "Input");
       need_run = std::all_of(
           xs.begin(), xs.end(),
           [](const framework::LoDTensor *t) { return t->numel() != 0; });
@@ -166,11 +177,11 @@ class ConditionalBlockGradOp : public ConditionalOp {
       auto *block = Attr<framework::BlockDesc *>("sub_block");
       exec.Run(*block->Program(), &cur_scope, block->ID(), false);
 
-      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"),
-                                  Outputs(framework::GradVarName("Params")));
+      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Input"),
+                                  Outputs(framework::GradVarName("Input")));
 
-      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"),
-                                  Outputs(framework::GradVarName("X")));
+      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Cond"),
+                                  Outputs(framework::GradVarName("Cond")));
     }
   }
 
@@ -199,15 +210,15 @@ class ConditionalBlockGradOp : public ConditionalOp {
 class ConditionalBlockGradInferShape : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInputs("X"));
-    if (context->HasInputs("Params")) {
-      PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Params")));
-      context->SetOutputsDim(framework::GradVarName("Params"),
-                             context->GetInputsDim("Params"));
+    PADDLE_ENFORCE(context->HasInputs("Cond"));
+    if (context->HasInputs("Input")) {
+      PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Input")));
+      context->SetOutputsDim(framework::GradVarName("Input"),
+                             context->GetInputsDim("Input"));
     }
-    if (context->HasOutputs(framework::GradVarName("X"))) {
-      context->SetOutputsDim(framework::GradVarName("X"),
-                             context->GetInputsDim("X"));
+    if (context->HasOutputs(framework::GradVarName("Cond"))) {
+      context->SetOutputsDim(framework::GradVarName("Cond"),
+                             context->GetInputsDim("Cond"));
     }
   }
 };
@@ -220,14 +231,15 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
   std::unique_ptr<framework::OpDesc> Apply() const override {
     auto grad_op = new framework::OpDesc();
     grad_op->SetType("conditional_block_grad");
-    grad_op->SetInput("X", Input("X"));
-    grad_op->SetInput("Params", Input("Params"));
+    grad_op->SetInput("Cond", Input("Cond"));
+    grad_op->SetInput("Input", Input("Input"));
     grad_op->SetInput("Out", Output("Out"));
     grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
     grad_op->SetInput("Scope", Output("Scope"));
-    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
-    grad_op->SetOutput(framework::GradVarName("Params"),
-                       InputGrad("Params", false));
+    grad_op->SetOutput(framework::GradVarName("Cond"),
+                       InputGrad("Cond", false));
+    grad_op->SetOutput(framework::GradVarName("Input"),
+                       InputGrad("Input", false));
     grad_op->SetBlockAttr("sub_block", this->grad_block_[0]);
     grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition"));
     return std::unique_ptr<framework::OpDesc>(grad_op);
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index 59bfe8f61d8ebb530ba617006650c0ef9215e2a6..22cbf680c0670552fb014043c69fcadc56863529 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -39,27 +39,6 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
     static_cast<size_t>(1024) * 1024 * 1024;
 
-template <typename T, typename DeviceContext>
-// bool EnableFp16(const T& dummy, const DeviceContext& dev_ctx,
-bool EnableFp16(const DeviceContext& dev_ctx,
-                cudnnConvolutionDescriptor_t cudnn_conv_desc) {
-#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
-  // Tensor core is supported since the volta GPU and
-  // is only enabled when input and filter data are float16
-  if (dev_ctx.GetComputeCapability() >= 70 &&
-      std::type_index(typeid(T)) ==
-          std::type_index(typeid(platform::float16))) {
-    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-        cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
-    return true;
-  } else {
-    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-        cudnn_conv_desc, CUDNN_DEFAULT_MATH));
-  }
-#endif
-  return false;
-}
-
 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
  public:
@@ -149,14 +128,27 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionFwdAlgo_t algo;
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
-    if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
+
+    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+        cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        workspace_size_limit, &algo));
+
+#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
+    // Tensor core is supported since the volta GPU and
+    // is only enabled when input and filter data are float16
+    if (dev_ctx.GetComputeCapability() >= 70 &&
+        std::type_index(typeid(T)) ==
+            std::type_index(typeid(platform::float16))) {
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
+      // Currently tensor core is only enabled using this algo
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
     } else {
-      PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_DEFAULT_MATH));
     }
+#endif
 
     // get workspace size able to allocate
     CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
@@ -296,9 +288,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       } else {
         data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
       }
-      if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
-        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
-      }
 
       CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
@@ -318,9 +307,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       } else {
         filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
       }
-      if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
-        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
-      }
 
       CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
@@ -376,8 +362,7 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>,
-                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
+                   paddle::operators::CUDNNConvGradOpKernel<double>);
 
 REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<float>,
@@ -385,5 +370,4 @@ REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>,
-                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>)
+                   paddle::operators::CUDNNConvGradOpKernel<double>);
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index f07ab5a33b87d7945e5fcdf8f3644f0711ce643b..527a87db533ac25c3170fbb3ae6a9b9aff589b3d 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -126,6 +126,15 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
                                pipeline);
   }
 
+  std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_bias_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto user_bias_pd = user_bias_memory_p->get_primitive_desc();
+    auto bias_pd = conv_pd_->bias_primitive_desc();
+    return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p,
+                               "@bias_mem_p", pipeline);
+  }
+
   std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
       std::shared_ptr<mkldnn::memory> src_memory_p,
       std::shared_ptr<mkldnn::memory> weights_memory_p,
@@ -147,6 +156,28 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
     return conv_p;
   }
 
+  std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
+      std::shared_ptr<mkldnn::memory> src_memory_p,
+      std::shared_ptr<mkldnn::memory> weights_memory_p,
+      std::shared_ptr<mkldnn::memory> bias_memory_p,
+      std::shared_ptr<mkldnn::memory> dst_memory_p) {
+    auto prim_key = key_ + "@conv_p";
+    auto conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
+        dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find convolution primitive in device context");
+    if (conv_p == nullptr) {
+      conv_p = std::make_shared<mkldnn::convolution_forward>(
+          *conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
+          *(bias_memory_p.get()), *(dst_memory_p.get()));
+
+      dev_ctx_.SetBlob(prim_key, conv_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_p;
+  }
+
   std::shared_ptr<mkldnn::convolution_backward_weights>
   AcquireConvolutionBackwardWeights(
       std::shared_ptr<mkldnn::memory> src_memory_p,
@@ -229,6 +260,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     auto* input = ctx.Input<Tensor>("Input");
     auto* filter = ctx.Input<Tensor>("Filter");
+    auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
     auto* output = ctx.Output<Tensor>("Output");
 
     PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
@@ -237,6 +269,17 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
                        filter->format() != memory::format::format_undef,
                    "Wrong layout/format set for Filter tensor");
+    PADDLE_ENFORCE(input->dims().size() == 4,
+                   "Input must be with 4 dimensions, i.e. NCHW");
+    PADDLE_ENFORCE(filter->dims().size() == 4,
+                   "Filter must be with 4 dimensions, i.e. OIHW");
+    if (bias) {
+      PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN &&
+                         bias->format() != memory::format::format_undef,
+                     "Wrong layout/format set for Bias tensor");
+      PADDLE_ENFORCE(bias->dims().size() == 1,
+                     "Bias must only have 1 dimension, i.e. X");
+    }
 
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
@@ -253,11 +296,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const T* filter_data = filter->data<T>();
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
 
-    PADDLE_ENFORCE(input->dims().size() == 4,
-                   "Input must be with 4 dimensions, i.e. NCHW");
-    PADDLE_ENFORCE(filter->dims().size() == 4,
-                   "Filter must be with 4 dimensions, i.e. OIHW");
-
     std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
     std::vector<int> weights_tz =
         paddle::framework::vectorize2int(filter->dims());
@@ -288,13 +326,23 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
         weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+    std::vector<int> bias_tz;  // TODO(mgallus): avoid empty vector creation.
+                               // Currently used whenever bias is != nullptr.
     auto dst_md = platform::MKLDNNMemDesc(
         dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
 
     // create a conv primitive descriptor and save it for usage in backward
-    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
-        ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
-                             mkldnn_engine);
+    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
+    if (bias) {
+      bias_tz = paddle::framework::vectorize2int(bias->dims());
+      auto bias_md = platform::MKLDNNMemDesc(
+          bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
+      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
+                                     strides, paddings, mkldnn_engine);
+    } else {
+      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides,
+                                     paddings, mkldnn_engine);
+    }
     // Save conv_pd/src_memory/weights_memory for backward pass
     dev_ctx.SetBlob(key_conv_pd, conv_pd);
 
@@ -315,8 +363,22 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
 
     // create convolution op primitive
-    auto conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
-                                             dst_memory_p);
+    std::shared_ptr<mkldnn::convolution_forward> conv_p;
+    if (bias) {
+      const T* bias_data = bias->data<T>();
+      auto user_bias_md = platform::MKLDNNMemDesc(
+          {bias_tz}, platform::MKLDNNGetDataType<T>(), memory::format::x);
+      auto user_bias_memory_p =
+          handler.AcquireBiasMemory(user_bias_md, to_void_cast<T>(bias_data));
+
+      auto bias_memory_p =
+          handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline);
+      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
+                                          bias_memory_p, dst_memory_p);
+    } else {
+      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
+                                          dst_memory_p);
+    }
 
     // push primitive to stream and wait until it's executed
     pipeline.push_back(*conv_p);
@@ -346,6 +408,27 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
         p_conv_pd);
   }
+
+  std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
+  ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
+                       const memory::desc& bias, const memory::desc& dst,
+                       const std::vector<int>& strides,
+                       const std::vector<int>& paddings,
+                       const mkldnn::engine& engine) const {
+    memory::dims stride_dims = {strides[0], strides[1]};
+    memory::dims padding_dims = {paddings[0], paddings[1]};
+
+    auto conv_desc = mkldnn::convolution_forward::desc(
+        mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights,
+        bias, dst, stride_dims, padding_dims, padding_dims,
+        mkldnn::padding_kind::zero);
+
+    auto p_conv_pd =
+        new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
+
+    return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
+        p_conv_pd);
+  }
 };
 
 template <typename T>
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 37153d58439a90190eb2ad82d5dcc145e22dfa48..61ca80877a6dfcdf30a0ff346342116e36eec6f2 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -37,6 +37,7 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 
   auto in_dims = ctx->GetInputDim("Input");
   auto filter_dims = ctx->GetInputDim("Filter");
+
   std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
   int groups = ctx->Attrs().Get<int>("groups");
@@ -57,7 +58,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups,
                     "The number of input channels should be equal to filter "
                     "channels * groups.");
-
   PADDLE_ENFORCE_EQ(
       filter_dims[0] % groups, 0,
       "The number of output channels should be divided by groups.");
@@ -122,6 +122,11 @@ void Conv2DOpMaker::Make() {
            "H is the height of the filter, and W is the width of the filter. "
            "If the groups attribute is greater than 1, C equals the number of "
            "input image channels divided by the groups.");
+  AddInput("Bias",
+           "(Tensor) Bias to be added to each output of filter application."
+           "The format of output tensor is X (one-dimensional) of size equal"
+           "to the number of output channels. Only used with MKL-DNN.")
+      .AsDispensable();
   AddOutput("Output",
             "(Tensor) The output tensor of convolution operator. "
             "The format of output tensor is also NCHW.")
diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h
index 3f5fab3b382bea97f43e4bc1b2cd436c956ba264..8181897c3d3844bda5574e85a08b2af038fcd664 100644
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
@@ -85,6 +85,199 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
     int* track_value =
         track.mutable_data<int>(emission_dims, platform::CPUPlace());
 
+#ifdef __AVX__
+// It use the AVX or AVX512 instruction to deal the data as the vector of 8 or
+// 16 elements per iteration. Then it can implement the parallel processing.
+// Only optimize for float type.
+#ifdef __AVX512F__
+    size_t step_size = 16;
+#else
+    size_t step_size = 8;
+#endif
+    if (std::is_same<T, float>::value && (tag_num >= step_size)) {
+      size_t steps = tag_num / step_size;
+      size_t remain = tag_num % step_size;
+      int last_offset = static_cast<int>(remain) - static_cast<int>(step_size);
+
+      // Setup the alpha initial value.
+      size_t i_offset = 0;
+      for (size_t i = 0; i <= steps; ++i) {
+#ifdef __AVX512F__
+        // Declare the variable for the content of weights, input and alpha
+        // values.
+        __m512 w_content, x_content, alpha_content;
+
+        // Load the relevant data into the variables from un-aligned address.
+        w_content = _mm512_loadu_ps((const float*)(w + i_offset));
+        x_content = _mm512_loadu_ps((const float*)(x + i_offset));
+        alpha_content = _mm512_add_ps(w_content, x_content);
+
+        // Save the alpha value.
+        _mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
+                         alpha_content);
+#else
+        // Declare the variable for the content of weights, input and alpha
+        // values.
+        __m256 w_content, x_content, alpha_content;
+
+        // Load the relevant data into the variables from un-aligned address.
+        w_content = _mm256_loadu_ps((const float*)(w + i_offset));
+        x_content = _mm256_loadu_ps((const float*)(x + i_offset));
+        alpha_content = _mm256_add_ps(w_content, x_content);
+
+        // Save the alpha value.
+        _mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
+                         alpha_content);
+#endif
+        i_offset += step_size;
+        if (i == steps - 1) {
+          if (remain > 0) {
+            i_offset += last_offset;
+          } else {
+            break;
+          }
+        }
+      }
+
+      // Use the column-major strategy to get the location of maximum score.
+      size_t seq_offset = 0;
+      for (size_t k = 1; k < seq_len; ++k) {
+        size_t j_offset = 0;
+        for (size_t j = 0; j <= steps; ++j) {
+#ifdef __AVX512F__
+          // Initialize the variables of maximum score and location.
+          __m512 max_score = _mm512_set1_ps(-std::numeric_limits<T>::max());
+          __m512i max_j = _mm512_setzero_si512();
+#else
+          // Initialize the variables of maximum score and location.
+          __m256 max_score = _mm256_set1_ps(-std::numeric_limits<T>::max());
+          __m256i max_j = _mm256_set1_epi32(0);
+#endif
+          // Calculate the offset of transition_weights.
+          size_t trans_offset = state_trans_base_idx * tag_num + j_offset;
+          for (size_t i = 0; i < tag_num; ++i) {
+#ifdef __AVX512F__
+            // Initalize the content of alpha variable with related offset.
+            __m512 alpha_content =
+                _mm512_set1_ps(*(const float*)(alpha_value + seq_offset + i));
+            // Obtain the content of weights from un-aligned address.
+            __m512 w_content =
+                _mm512_loadu_ps((const float*)(w + trans_offset));
+
+            __m512 score_v = _mm512_add_ps(alpha_content, w_content);
+
+            __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS);
+
+            // According to the mask value, it update the index of the max_score
+            // location.
+            max_j = _mm512_mask_set1_epi32(max_j, mask, i);
+
+            // Update the max_score value.
+            max_score = _mm512_max_ps(max_score, score_v);
+#else
+            // Initalize the content of alpha variable with related offset.
+            __m256 alpha_content = _mm256_broadcast_ss(
+                (const float*)(alpha_value + seq_offset + i));
+            // Obtain the content of weights from un-aligned address.
+            __m256 w_content =
+                _mm256_loadu_ps((const float*)(w + trans_offset));
+            __m256 score_v = _mm256_add_ps(alpha_content, w_content);
+
+            __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);
+
+#ifdef __AVX2__
+            // According to the mask value, it update the index of the max_score
+            // location.
+            max_j = _mm256_or_si256(
+                _mm256_andnot_si256((__m256i)mask, max_j),
+                _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));
+#else
+            __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);
+            __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);
+            __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0);
+            __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1);
+
+            lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);
+            hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);
+            lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));
+            hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));
+
+            lo_max_j = _mm_or_si128(lo_mask, lo_max_j);
+            hi_max_j = _mm_or_si128(hi_mask, hi_max_j);
+
+            // According to the mask value, it update the index of the max_score
+            // location.
+            max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);
+            max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);
+#endif
+
+            // Update the max_score value.
+            max_score = _mm256_max_ps(max_score, score_v);
+#endif
+            trans_offset += tag_num;
+          }
+
+#ifdef __AVX512F__
+          // Update the alpha and track values.
+          __m512 x_content = _mm512_loadu_ps(
+              (const float*)(x + seq_offset + tag_num + j_offset));
+          max_score = _mm512_add_ps(max_score, x_content);
+          _mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
+                                                    tag_num + j_offset),
+                           max_score);
+          _mm512_storeu_si512(
+              reinterpret_cast<__m512i*>(track_value + seq_offset + tag_num +
+                                         j_offset),
+              max_j);
+#else
+          // Update the alpha and track values.
+          __m256 x_content = _mm256_loadu_ps(
+              (const float*)(x + seq_offset + tag_num + j_offset));
+          max_score = _mm256_add_ps(max_score, x_content);
+          _mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
+                                                    tag_num + j_offset),
+                           max_score);
+          _mm256_storeu_si256(
+              reinterpret_cast<__m256i*>(track_value + seq_offset + tag_num +
+                                         j_offset),
+              max_j);
+#endif
+
+          // Calculate the offset of next step
+          j_offset += step_size;
+          if (j == steps - 1) {
+            if (remain > 0) {
+              j_offset += last_offset;
+            } else {
+              break;
+            }
+          }
+        }
+
+        seq_offset += tag_num;
+      }
+    } else {
+      for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
+
+      for (size_t k = 1; k < seq_len; ++k) {
+        for (size_t i = 0; i < tag_num; ++i) {
+          T max_score = -std::numeric_limits<T>::max();
+          int max_j = 0;
+          for (size_t j = 0; j < tag_num; ++j) {
+            T score = alpha_value[(k - 1) * tag_num + j] +
+                      w[(j + state_trans_base_idx) * tag_num + i];
+            if (score > max_score) {
+              max_score = score;
+              max_j = j;
+            }
+          }
+
+          alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
+          track_value[k * tag_num + i] = max_j;
+        }
+      }
+    }
+#else
     for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
 
     for (size_t k = 1; k < seq_len; ++k) {
@@ -105,6 +298,7 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
       }
     }
 
+#endif
     T max_score = -std::numeric_limits<T>::max();
     int max_i = 0;
     for (size_t i = 0; i < tag_num; ++i) {
diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu
index 65fd3a5dbc9ffed4c5d1114346fcc0660c183dae..30dbd5bd3d39dd2992c3dd91364003bb7715a2eb 100644
--- a/paddle/fluid/operators/cross_entropy_op.cu
+++ b/paddle/fluid/operators/cross_entropy_op.cu
@@ -13,16 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cross_entropy_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 using CUDACtx = paddle::platform::CUDADeviceContext;
 REGISTER_OP_CUDA_KERNEL(cross_entropy,
                         ops::CrossEntropyOpKernel<CUDACtx, float>,
-                        ops::CrossEntropyOpKernel<CUDACtx, double>,
-                        ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
-    ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
-    ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
+                        ops::CrossEntropyOpKernel<CUDACtx, double>);
+REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
+                        ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
+                        ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index de1a503154deb967eb4389a9f43b86c05626d966..66784f0b5149a7c479a90a407709d993f4a40a8b 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -130,12 +130,13 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
       checkpoint_notify_id != -1,
       "when checkpoint_notify_id = -1, there should be no RPC invoke.");
 
-  auto* lt_var = scope->FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
+  // TODO(tangwei12): find out why scope will be error.
+  auto* lt_var = scope_->FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
   lt_var->clear();
   lt_var->append(out_var_name);
   VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: "
           << out_var_name;
-  executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope);
+  executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope_);
   return true;
 }
 
diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc
index b50830c362d3f6ecf38affbfa6a1ffe2ed77e125..d6176e1443d2a441af7878e5efe99796d486bb7a 100644
--- a/paddle/fluid/operators/distributed/rpc_server_test.cc
+++ b/paddle/fluid/operators/distributed/rpc_server_test.cc
@@ -78,10 +78,9 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
                          int64_t rows_numel) {
   CreateVarsOnScope(scope, place);
   auto w = scope->Var("w")->GetMutable<framework::SelectedRows>();
-  auto rows = w->mutable_rows();
-  for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i);
   auto w_value = w->mutable_value();
   w_value->Resize({rows_numel, 10});
+  for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true);
 
   auto ptr = w_value->mutable_data<float>(*place);
 
diff --git a/paddle/fluid/operators/elementwise_add_op.cu b/paddle/fluid/operators/elementwise_add_op.cu
index f9f5c66d34fa1d73db00173e493f9953b8579518..dfff518f170b56d180b6883c363effb8dbd677b6 100644
--- a/paddle/fluid/operators/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise_add_op.cu
@@ -30,5 +30,4 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
     ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
     ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>);
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise_div_op.cu b/paddle/fluid/operators/elementwise_div_op.cu
index 4cc7ba0f43c6031bf4a27222a17eca84bad5a668..588d1f7420241ba1697e5141e4e4a2870f2dc87c 100644
--- a/paddle/fluid/operators/elementwise_div_op.cu
+++ b/paddle/fluid/operators/elementwise_div_op.cu
@@ -14,24 +14,19 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_div_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
     elementwise_div,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
-                              plat::float16>);
+    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_div_grad,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
-                                  plat::float16>);
+                                  int64_t>);
diff --git a/paddle/fluid/operators/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise_mul_op.cu
index 350d43168dea7e88127b0d28d663e680458e1dba..2fb1b4bee689c059625e3dbd59f80c541ace83a0 100644
--- a/paddle/fluid/operators/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise_mul_op.cu
@@ -14,25 +14,19 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_mul_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
     elementwise_mul,
     ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext,
-                              plat::float16>);
+    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_mul_grad,
     ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
-                                  plat::float16>,
     ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
                                   int64_t>);
diff --git a/paddle/fluid/operators/elementwise_mul_op.h b/paddle/fluid/operators/elementwise_mul_op.h
index 82c5fa0472bcc3b4d2d12b7a80c3418da5d6dd7b..4437da4d95f97b5cbbca1650badf9710c26b4380 100644
--- a/paddle/fluid/operators/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise_mul_op.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/operators/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
+
 namespace paddle {
 namespace operators {
 
@@ -23,6 +25,37 @@ struct MulFunctor {
   inline HOSTDEVICE T operator()(T a, T b) const { return a * b; }
 };
 
+template <typename DeviceContext, typename T>
+void default_elementwise_mul(const framework::ExecutionContext& ctx,
+                             const framework::Tensor* x,
+                             const framework::Tensor* y, framework::Tensor* z) {
+  int axis = ctx.Attr<int>("axis");
+  ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                        MulFunctor<T>(), z);
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_mul(const framework::ExecutionContext& ctx,
+                const framework::Tensor* x, const framework::Tensor* y,
+                framework::Tensor* z) {
+  auto blas = math::GetBlas<DeviceContext, T>(ctx);
+  blas.VMUL(x->numel(), x->data<T>(), y->data<T>(),
+            z->mutable_data<T>(ctx.GetPlace()));
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    !std::is_floating_point<T>::value ||
+    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_mul(const framework::ExecutionContext& ctx,
+                const framework::Tensor* x, const framework::Tensor* y,
+                framework::Tensor* z) {
+  default_elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+}
+
 template <typename DeviceContext, typename T>
 class ElementwiseMulKernel : public framework::OpKernel<T> {
  public:
@@ -33,9 +66,11 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
     auto* y = ctx.Input<Tensor>("Y");
     auto* z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
-    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          MulFunctor<T>(), z);
+    if (x->numel() == y->numel()) {
+      elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+    } else {
+      default_elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
index 7223a972d23119c8ef93fb49bfe42922cc14571d..f90dcdc156590b776f817a4933d5a9b45868ba98 100644
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -80,6 +80,9 @@ inline framework::DDim trim_trailing_singular_dims(
   for (int i = 0; i < actual_dims_size; ++i) {
     trim_dims[i] = dims[i];
   }
+  if (trim_dims.size() == 0) {
+    return framework::DDim(framework::make_dim());
+  }
   framework::DDim actual_dims = framework::make_ddim(trim_dims);
   return actual_dims;
 }
@@ -350,7 +353,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
   int j = blockIdx.x;
   int i = threadIdx.x;
   int tid = threadIdx.x;
-  T val(0);
+  T val = 0;
 
   do {
     int x_offset = i * w + j;
@@ -418,7 +421,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
   int tid = threadIdx.x;
   int j = blockIdx.x;
 
-  T val(0);
+  T val = 0;
   int ttid = tid;
 
   while (true) {
diff --git a/paddle/fluid/operators/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise_sub_op.cu
index ff3f6f8a2cb542c2fb6b43d539f6413b39250992..8709f686f9af1bf4dacbc2dfc3e2d5dcc1c59b9a 100644
--- a/paddle/fluid/operators/elementwise_sub_op.cu
+++ b/paddle/fluid/operators/elementwise_sub_op.cu
@@ -14,25 +14,19 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_sub_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
     elementwise_sub,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
-                              plat::float16>);
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_sub_grad,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
-                                  plat::float16>,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
                                   int64_t>);
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index 099ca52c8e945a0e93c2f13adb612158c67397cf..fa4dec9cf118cef9b836943fd4eae90d23e6218a 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -15,8 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/fc_op.h"
 #include <vector>
 #include "paddle/fluid/operators/math/blas.h"
-
-DECLARE_int32(paddle_num_threads);
+#include "paddle/fluid/operators/math/fc_compute.h"
 
 namespace paddle {
 namespace operators {
@@ -36,9 +35,14 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
 
   if (ctx->HasInput("Bias")) {
     auto bias_dims = ctx->GetInputDim("Bias");
-    PADDLE_ENFORCE_EQ(bias_dims[0], 1, "The shape of Bias must be [1, dim].");
-    PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1],
-                      "The shape of Bias must be [1, dim].");
+    if (bias_dims.size() == 2) {
+      PADDLE_ENFORCE_EQ(bias_dims[0], 1, "The shape of Bias must be [1, dim].");
+      PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1],
+                        "The shape of Bias must be [1, dim].");
+    } else if (bias_dims.size() == 1) {
+      PADDLE_ENFORCE_EQ(bias_dims[0], w_dims[1],
+                        "The shape of Bias must be [1, dim].");
+    }
   }
   PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
                  "Fully Connected input should be 2-D or 4-D tensor.");
@@ -110,13 +114,8 @@ void FCOpMaker::Make() {
   AddComment(R"DOC(
   Fully Connected Operator.
 
-  The fully connected operation calculates the output based on the input, weights and bias attribute.
+  The fully connected operation calculates the output based on the input, weights and bias.
   The size of each dimension of the parameters checked in the infer-shape.
-  The matrix of bias is generated by the mkldnn framework, when the bias_attr is True.
-  Additional parametrs are use_mkldnn and bias_attr.
-  The input(X) size and output(Out) size may be diffrent.
-
-  The fully connected layer only supports MKLDNN version
 )DOC");
 }
 
@@ -133,26 +132,15 @@ class FCOpKernel : public framework::OpKernel<T> {
     auto in_dims = input->dims();
     auto w_dims = w->dims();
 
-    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(dev_ctx);
     const T* input_data = input->data<T>();
     const T* w_data = w->data<T>();
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+    math::FCCompute<platform::CPUDeviceContext, T>(
+        blas, in_dims[0], w_dims[1], w_dims[0], input_data, w_data, output_data,
+        bias ? bias->data<T>() : NULL);
 
-    blas.GEMM(CblasNoTrans, CblasNoTrans, in_dims[0], w_dims[1], w_dims[0],
-              static_cast<T>(1), input_data, w_data, static_cast<T>(0),
-              output_data);
-
-    if (bias) {
-      const T* bias_data = bias->data<T>();
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
-#endif
-      for (int bs = 0; bs < in_dims[0]; bs++) {
-        blas.AXPY(w_dims[1], static_cast<T>(1), bias_data,
-                  output_data + bs * w_dims[1]);
-      }
-    }
+    // TODO(TJ): fuse act
   }
 };
 
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 862249269eaecdac262a691c884ea59f89f54061..130f18dde4f979a6a9925ede9cbf745fcec14d48 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -12,28 +12,48 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/fill_constant_op.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
 
-class FillConstantOp : public framework::OperatorWithKernel {
+class FillConstantInferShape : public framework::InferShapeBase {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void operator()(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of FillConstantOp should not be null.");
-    auto& shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
     ctx->SetOutputDim("Out", framework::make_ddim(shape));
   }
+};
+
+class FillConstantOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    auto data_type =
+        static_cast<framework::proto::VarType::Type>(Attr<int>("dtype"));
+    auto value = Attr<float>("value");
+    auto force_cpu = Attr<bool>("force_cpu");
+    auto &out =
+        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
+    if (force_cpu) {
+      auto cpu = platform::CPUPlace();
+      out.mutable_data(cpu, framework::ToTypeIndex(data_type));
+    } else {
+      out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
+    }
 
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
-        ctx.device_context());
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+    math::set_constant(dev_ctx, &out, value);
   }
 };
 
@@ -67,11 +87,6 @@ Fill up a variable with specified constant value.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker,
+REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
+                  ops::FillConstantInferShape, ops::FillConstantOpMaker,
                   paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    fill_constant,
-    ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, int64_t>)
diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h
deleted file mode 100644
index b2a2a7b2faedf9b94e01ed908ff39749973be1df..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_constant_op.h
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class FillConstantOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto data_type =
-        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
-    auto value = ctx.Attr<float>("value");
-    auto force_cpu = ctx.Attr<bool>("force_cpu");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    out->Resize(framework::make_ddim(ctx.Attr<std::vector<int>>("shape")));
-    if (force_cpu) {
-      auto cpu = platform::CPUPlace();
-      out->mutable_data(cpu, framework::ToTypeIndex(data_type));
-    } else {
-      out->mutable_data(ctx.GetPlace(), framework::ToTypeIndex(data_type));
-    }
-
-    math::set_constant(ctx.template device_context<DeviceContext>(), out,
-                       value);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc
index 352a17c927bc70bdd6e4307951f0e0ac3d10ac2d..925dc19061e2196a40411f415eb6e5ad59ab52ff 100644
--- a/paddle/fluid/operators/fill_op.cc
+++ b/paddle/fluid/operators/fill_op.cc
@@ -16,7 +16,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
@@ -70,6 +69,7 @@ class FillOp : public framework::OperatorBase {
 
     framework::VisitDataType(
         dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value")));
+
     if (!force_cpu && platform::is_gpu_place(place)) {
       // Copy tensor to out
       platform::DeviceContextPool &pool =
diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3888333ec5626f1d8d35db215085f483c985cf0a
--- /dev/null
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -0,0 +1,354 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fusion_lstm_op.h"
+#include <string>
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/detail/activation_functions.h"
+#include "paddle/fluid/operators/math/fc_compute.h"
+#include "paddle/fluid/operators/math/lstm_compute.h"
+#include "paddle/fluid/operators/math/sequence2batch.h"
+
+namespace paddle {
+namespace operators {
+
+void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("WeightX"),
+                 "Input(WeightX) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
+                 "Input(WeightH) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                 "Input(Bias) of LSTM should not be null.");
+
+  PADDLE_ENFORCE(ctx->HasOutput("XX"),
+                 "Output(XX) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
+                 "Output(Hidden) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Cell"),
+                 "Output(Cell) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"),
+                 "Output(BatchedGate) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
+                 "Output(BatchedGate) of LSTM should not be null.");
+
+  auto x_dims = ctx->GetInputDim("X");
+  PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
+
+  if (ctx->HasInput("H0")) {
+    PADDLE_ENFORCE(ctx->HasInput("C0"),
+                   "Input(Cell) and Input(Hidden) of LSTM should not "
+                   "be null at the same time.");
+    auto h_dims = ctx->GetInputDim("H0");
+    auto c_dims = ctx->GetInputDim("C0");
+    PADDLE_ENFORCE(h_dims == c_dims,
+                   "The dimension of Input(H0) and Input(C0) "
+                   "should be the same.");
+  }
+
+  auto wx_dims = ctx->GetInputDim("WeightX");
+  PADDLE_ENFORCE_EQ(wx_dims.size(), 2,
+                    "The rank of Input(WeightX) should be 2.");
+  PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1],
+                    "The first dimension of Input(WeightX) "
+                    "should be %d.",
+                    x_dims[1]);
+
+  int frame_size = wx_dims[1] / 4;
+  auto wh_dims = ctx->GetInputDim("WeightH");
+  PADDLE_ENFORCE_EQ(wh_dims.size(), 2,
+                    "The rank of Input(WeightH) should be 2.");
+  PADDLE_ENFORCE_EQ(wh_dims[0], frame_size,
+                    "The first dimension of Input(WeightH) "
+                    "should be %d.",
+                    frame_size);
+  PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size,
+                    "The second dimension of Input(WeightH) "
+                    "should be 4 * %d.",
+                    frame_size);
+
+  auto b_dims = ctx->GetInputDim("Bias");
+  PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
+  PADDLE_ENFORCE_EQ(b_dims[0], 1,
+                    "The first dimension of Input(Bias) should be 1.");
+
+  PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_peepholes"),
+                 "Do not support peephole yet.");
+  PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
+                    "The second dimension of Input(Bias) should be "
+                    "4 * %d if disable peepholes connection",
+                    frame_size);
+
+  framework::DDim out_dims({x_dims[0], frame_size});
+  ctx->SetOutputDim("Hidden", out_dims);
+  ctx->SetOutputDim("Cell", out_dims);
+  ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]});
+  ctx->SetOutputDim("BatchCellPreAct", out_dims);
+  ctx->ShareLoD("X", "Hidden");
+  ctx->ShareLoD("X", "Cell");
+
+  int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
+  ctx->SetOutputDim("XX", {x_dims[0], xx_width});
+  ctx->ShareLoD("X", "XX");
+}
+
+framework::OpKernelType FusionLSTMOp::GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const {
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+      ctx.device_context());
+}
+
+void FusionLSTMOpMaker::Make() {
+  AddInput("X",
+           "(LoDTensor) the input is a LodTensor, which support "
+           "variable-time length input sequence. The underlying tensor in "
+           "this LoDTensor is a matrix with shape (T X M), where T is the "
+           "total time steps in this mini-batch, M is the dim size of x.");
+  AddInput("WeightX",
+           "(Tensor) the learnable weights of X."
+           " - The shape is (M x 4D), where M is the dim size of x, D is the "
+           "hidden size. "
+           " - Weight = {W_cx, W_ix, W_fx, W_ox}");
+  AddInput("WeightH",
+           "(Tensor) same as LSTMOp, the learnable hidden-hidden weights."
+           " - The shape is (D x 4D), where D is the hidden size. "
+           " - Weight = {W_ch, W_ih, W_fh, W_oh}");
+  AddInput("Bias",
+           "(Tensor) the learnable weights. Almost same as LSTMOp"
+           "Note: we should add the fc bias into this (1x4D) in bias."
+           "input-hidden bias weight and peephole connections weight if "
+           "setting `use_peepholes` True. "
+           "1. `use_peepholes = False` "
+           " - The shape is (1 x 4D). "
+           " - Bias = {b_c, b_i, b_f, b_o}."
+           "2. `use_peepholes = True` "
+           " - The shape is (1 x 7D). "
+           " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
+  AddInput("H0",
+           "(Tensor, optional) (same as LSTMOp) the initial hidden state is an "
+           "optional "
+           "input. This is a tensor with shape (N x D), where N is the "
+           "batch size and D is the hidden size.")
+      .AsDispensable();
+  AddInput("C0",
+           "(Tensor, optional) (same as LSTMOp) (the initial cell state is an "
+           "optional "
+           "input. This is a tensor with shape (N x D), where N is the "
+           "batch size. `H0` and `C0` can be NULL but only at the same time.")
+      .AsDispensable();
+  AddOutput("Hidden",
+            "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. "
+            "The shape is (T x D), and lod is the same with the `Input`.");
+  AddOutput("Cell",
+            "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. "
+            "The shape is (T x D), and lod is the same with the `Input`.");
+  AddOutput("XX",
+            "(LoDTensor) the result after X * WeightX (size is T x 4D)"
+            " or batched_X (size is T x M), this will be automatically chosen,"
+            " where T is the total time steps in this mini-batch,"
+            " D is the hidden size, M is the dim size of x input.")
+      .AsIntermediate();
+  AddOutput("BatchedGate", "(LoDTensor) (same as LSTMOp).").AsIntermediate();
+  AddOutput("BatchCellPreAct", "(LoDTensor) (same as LSTMOp).")
+      .AsIntermediate();
+  AddAttr<bool>("use_peepholes",
+                "(bool, defalut: True) "
+                "whether to enable diagonal/peephole connections.")
+      .SetDefault(true);
+  AddAttr<bool>("is_reverse",
+                "(bool, defalut: False) "
+                "whether to compute reversed LSTM.")
+      .SetDefault(false);
+  AddAttr<std::string>("gate_activation",
+                       "(string, default: sigmoid)"
+                       "The activation for input gate, forget gate and output "
+                       "gate, `sigmoid` by default.")
+      .SetDefault("sigmoid")
+      .InEnum({"sigmoid", "tanh", "relu", "identity"});
+  AddAttr<std::string>("cell_activation",
+                       "(string, default: tanh)"
+                       "The activation for cell output, `tanh` by defalut.")
+      .SetDefault("tanh")
+      .InEnum({"sigmoid", "tanh", "relu", "identity"});
+  AddAttr<std::string>("candidate_activation",
+                       "(string, default: tanh)"
+                       "The activation for candidate hidden state, "
+                       "`tanh` by default.")
+      .SetDefault("tanh")
+      .InEnum({"sigmoid", "tanh", "relu", "identity"});
+  AddComment(R"DOC(
+Fusion Long-Short Term Memory (LSTM) Operator.
+This operator fuse the X into LSTM, more details can refer to LSTM op.
+)DOC");
+}
+
+template <typename DeviceContext, typename T>
+inline void ReorderInitState(const DeviceContext& ctx,
+                             const framework::Tensor& src,
+                             framework::Vector<size_t> index_lod,
+                             framework::Tensor* dst, bool indexed_src) {
+  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
+  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
+  // TODO(TJ): check mem copy perf
+  row_shuffle(ctx, src, index_lod, dst, indexed_src);
+}
+
+template <typename DeviceContext, typename T>
+class FuisonLSTMKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* wx = ctx.Input<Tensor>("WeightX");
+    auto* wh = ctx.Input<Tensor>("WeightH");
+    auto* bias = ctx.Input<Tensor>("Bias");
+    auto* hidden_t0 = ctx.Input<Tensor>("H0");
+    auto* cell_t0 = ctx.Input<Tensor>("C0");
+
+    auto* xx = ctx.Output<LoDTensor>("XX");
+    auto* batched_gate = ctx.Output<LoDTensor>("BatchedGate");
+    auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
+    auto* cell_out = ctx.Output<LoDTensor>("Cell");
+    bool is_reverse = ctx.Attr<bool>("is_reverse");
+
+    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
+    T* batched_gate_data = batched_gate->mutable_data<T>(ctx.GetPlace());
+    hidden_out->mutable_data<T>(ctx.GetPlace());
+    cell_out->mutable_data<T>(ctx.GetPlace());
+
+    const T* x_data = x->data<T>();
+    const T* wx_data = wx->data<T>();
+    auto x_dims = x->dims();
+    auto wx_dims = wx->dims();
+
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    if (x_dims[1] > wx_dims[1]) {
+      math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
+                                        x_data, wx_data, xx_data,
+                                        bias->data<T>());
+      to_batch(dev_ctx, *xx, batched_gate, true, is_reverse);
+    } else {
+      to_batch(dev_ctx, *x, xx, true, is_reverse);
+      batched_gate->set_lod(xx->lod());
+      math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
+                                        xx_data, wx_data, batched_gate_data,
+                                        bias->data<T>());
+    }
+
+    int frame_size = static_cast<int>(wx_dims[1] / 4);
+    framework::DDim out_dims({x_dims[0], frame_size});
+    math::LstmMetaValue<T> lstm_value;
+    // no peephole
+    lstm_value.check_ig = nullptr;
+    lstm_value.check_fg = nullptr;
+    lstm_value.check_og = nullptr;
+    lstm_value.prev_state_value = nullptr;
+    Tensor ordered_c0;
+
+    framework::Vector<size_t> order(batched_gate->lod()[2]);
+
+    if (cell_t0) {
+      // Since the batch computing for LSTM reorders the input sequence
+      // according to their length. The initialized cell state also needs
+      // to reorder.
+      ReorderInitState<DeviceContext, T>(dev_ctx, *cell_t0, order, &ordered_c0,
+                                         true);
+      lstm_value.prev_state_value = ordered_c0.data<T>();
+    }
+
+    // Use the local variable as here.
+    LoDTensor batch_hidden, batch_cell;
+    auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
+    batch_hidden.mutable_data<T>(out_dims, ctx.GetPlace());
+    batch_cell.mutable_data<T>(out_dims, ctx.GetPlace());
+    batch_cell_pre_act->mutable_data<T>(out_dims, ctx.GetPlace());
+
+    auto batch_starts = batched_gate->lod()[0];
+    size_t max_seq_len = batch_starts.size() - 1;
+    auto gate_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("gate_activation"));
+    auto cell_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("cell_activation"));
+    auto cand_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("candidate_activation"));
+
+    for (size_t n = 0; n < max_seq_len; n++) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+
+      Tensor gate_t = batched_gate->Slice(bstart, bend);
+      Tensor out_t = batch_hidden.Slice(bstart, bend);
+      Tensor cell_t = batch_cell.Slice(bstart, bend);
+      Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
+
+      int cur_batch_size = bend - bstart;
+
+      if (n > 0) {
+        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
+        int pre_h_end = pre_h_start + cur_batch_size;
+        auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
+        // TODO(TJ): use gemm directly
+        blas.MatMul(pre_hidden_t, false, *wh, false, static_cast<T>(1.0),
+                    &gate_t, static_cast<T>(1.0));
+      } else if (hidden_t0) {
+        // TODO(TJ): move h0 outside for
+        // If n == 0 and there is no initialized hidden state, that is to say
+        // the H0 is zeros, the calculation W_h * H0 will be skiped.
+        // If n == 0 and there is initialized hidden state, calculate W_h * H0.
+
+        // Since the batch computing for LSTM reorders the input sequence
+        // according to their length. The initialized hidden state also needs
+        // to reorder.
+        Tensor ordered_h0;
+        ReorderInitState<DeviceContext, T>(dev_ctx, *hidden_t0, order,
+                                           &ordered_h0, true);
+        // TODO(TJ): use gemm directly
+        blas.MatMul(ordered_h0, false, *wh, false, static_cast<T>(1.0), &gate_t,
+                    static_cast<T>(1.0));
+      }
+
+      lstm_value.gate_value = gate_t.data<T>();
+      lstm_value.output_value = out_t.data<T>();
+      lstm_value.state_value = cell_t.data<T>();
+      lstm_value.state_active_value = cell_pre_act_t.data<T>();
+      math::LstmUnitFunctor<DeviceContext, T>::compute(
+          dev_ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act,
+          cand_act);
+      lstm_value.prev_state_value = lstm_value.state_value;
+    }
+
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    batch_hidden.set_lod(batched_gate->lod());
+    // restore the output hidden in LoDTensor from the batch hidden
+    to_seq(dev_ctx, batch_hidden, hidden_out);
+
+    batch_cell.set_lod(batched_gate->lod());
+    // restore the output cell state in LoDTensor from the batch cell
+    to_seq(dev_ctx, batch_cell, cell_out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+
+REGISTER_OP_CPU_KERNEL(
+    fusion_lstm,
+    ops::FuisonLSTMKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::FuisonLSTMKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/fusion_lstm_op.h b/paddle/fluid/operators/fusion_lstm_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..39dc09b4d116193399d8ac9a51e88dbc3e239918
--- /dev/null
+++ b/paddle/fluid/operators/fusion_lstm_op.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+// #include <string>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+class FusionLSTMOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class FusionLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override;
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu
index b4907237954ba478197d5ca8bdcbc3e1915e9dcf..7784856417e579fd43f79fa331d46df8af6c36b8 100644
--- a/paddle/fluid/operators/gaussian_random_op.cu
+++ b/paddle/fluid/operators/gaussian_random_op.cu
@@ -15,7 +15,6 @@ limitations under the License. */
 #include <thrust/transform.h>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
@@ -61,7 +60,6 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(gaussian_random,
                         paddle::operators::GPUGaussianRandomKernel<float>,
                         paddle::operators::GPUGaussianRandomKernel<double>);
diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc
index 5c746878823b3dcde2573feec00d3d9dac5ceab8..087f903a8bba9a4bfcd7eaabd7098555442a904e 100644
--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
@@ -14,6 +14,11 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/gru_op.h"
 #include <string>
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
+#include "paddle/fluid/operators/math/detail/gru_kernel.h"
+
+DECLARE_int32(paddle_num_threads);
 
 namespace paddle {
 namespace operators {
@@ -211,6 +216,158 @@ class GRUGradOp : public framework::OperatorWithKernel {
   }
 };
 
+template <typename T>
+class GRUCPUKernel : public framework::OpKernel<T> {
+ public:
+  void BatchCompute(const framework::ExecutionContext& context) const {
+    using DeviceContext = paddle::platform::CPUDeviceContext;
+    auto* input = context.Input<LoDTensor>("Input");
+    auto* h0 = context.Input<Tensor>("H0");
+    auto* weight = context.Input<Tensor>("Weight");
+    const T* weight_data = weight->data<T>();
+    auto* bias = context.Input<Tensor>("Bias");
+    auto* batch_gate = context.Output<LoDTensor>("BatchGate");
+    batch_gate->mutable_data<T>(context.GetPlace());
+    auto* batch_reset_hidden_prev =
+        context.Output<LoDTensor>("BatchResetHiddenPrev");
+    batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
+    auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
+    batch_hidden->mutable_data<T>(context.GetPlace());
+    auto* hidden = context.Output<LoDTensor>("Hidden");
+    hidden->mutable_data<T>(context.GetPlace());
+
+    auto hidden_dims = hidden->dims();
+
+    bool is_reverse = context.Attr<bool>("is_reverse");
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    to_batch(dev_ctx, *input, batch_gate, true, is_reverse);
+
+    if (bias) {
+      math::RowwiseAdd<DeviceContext, T> add_bias;
+      add_bias(dev_ctx, *batch_gate, *bias, batch_gate);
+    }
+
+    int frame_size = hidden_dims[1];
+    math::GRUMetaValue<T> gru_value;
+    gru_value.gate_weight = const_cast<T*>(weight_data);
+    gru_value.state_weight =
+        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
+    Tensor ordered_h0;
+
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
+
+    if (h0) {
+      // Since the batch computing for GRU reorders the input sequences
+      // according to their length. The initialized cell state also needs
+      // to reorder.
+      ReorderInitState<DeviceContext, T>(
+          context.template device_context<DeviceContext>(), *h0, order,
+          &ordered_h0, true);
+      gru_value.prev_out_value = ordered_h0.data<T>();
+    } else {
+      gru_value.prev_out_value = nullptr;
+    }
+    auto batch_starts = batch_gate->lod()[0];
+    size_t seq_len = batch_starts.size() - 1;
+    auto active_node = math::detail::GetActivationType(
+        context.Attr<std::string>("activation"));
+    auto active_gate = math::detail::GetActivationType(
+        context.Attr<std::string>("gate_activation"));
+
+#ifdef PADDLE_WITH_MKLML
+    // use MKL packed to speedup GEMM
+    if (FLAGS_paddle_num_threads >= 4) {
+      auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+      T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
+                                       frame_size * 2 /*width of weight*/,
+                                       frame_size /*height of height*/);
+      PADDLE_ENFORCE(packed_gate);
+      blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2,
+                     frame_size, T(1.0), gru_value.gate_weight, frame_size * 2,
+                     packed_gate);
+      T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
+                                        frame_size /*width of weight*/,
+                                        frame_size /*height of height*/);
+      PADDLE_ENFORCE(packed_state);
+      blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size,
+                     frame_size, T(1.0), gru_value.state_weight, frame_size,
+                     packed_state);
+      for (size_t n = 0; n < seq_len; n++) {
+        int bstart = static_cast<int>(batch_starts[n]);
+        int bend = static_cast<int>(batch_starts[n + 1]);
+        int cur_batch_size = bend - bstart;
+
+        Tensor gate_t = batch_gate->Slice(bstart, bend);
+        Tensor reset_hidden_prev_t =
+            batch_reset_hidden_prev->Slice(bstart, bend);
+        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+        gru_value.output_value = hidden_t.data<T>();
+        gru_value.gate_value = gate_t.data<T>();
+        gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
+
+        if (gru_value.prev_out_value) {
+          blas.GEMM_COMPUTE(
+              CblasNoTrans, CblasPacked, cur_batch_size, frame_size * 2,
+              frame_size, gru_value.prev_out_value, frame_size, packed_gate,
+              frame_size * 2, T(1), gru_value.gate_value, frame_size * 3);
+        }
+
+        math::detail::forward_reset_output(
+            math::detail::forward::gru_resetOutput<T>(), gru_value, frame_size,
+            cur_batch_size, active_gate);
+
+        if (gru_value.prev_out_value) {
+          blas.GEMM_COMPUTE(
+              CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size,
+              gru_value.reset_output_value, frame_size, packed_state,
+              frame_size, T(1), gru_value.gate_value + frame_size * 2,
+              frame_size * 3);
+        }
+
+        math::detail::forward_final_output(
+            math::detail::forward::gru_finalOutput<T>(), gru_value, frame_size,
+            cur_batch_size, active_node);
+
+        gru_value.prev_out_value = gru_value.output_value;
+      }
+
+      blas.GEMM_FREE(packed_gate);
+      blas.GEMM_FREE(packed_state);
+    } else {
+#endif
+      for (size_t n = 0; n < seq_len; n++) {
+        int bstart = static_cast<int>(batch_starts[n]);
+        int bend = static_cast<int>(batch_starts[n + 1]);
+        int cur_batch_size = bend - bstart;
+
+        Tensor gate_t = batch_gate->Slice(bstart, bend);
+        Tensor reset_hidden_prev_t =
+            batch_reset_hidden_prev->Slice(bstart, bend);
+        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+        gru_value.output_value = hidden_t.data<T>();
+        gru_value.gate_value = gate_t.data<T>();
+        gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
+
+        math::GRUUnitFunctor<DeviceContext, T>::compute(
+            dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
+            active_gate);
+
+        gru_value.prev_out_value = gru_value.output_value;
+      }
+#ifdef PADDLE_WITH_MKLML
+    }
+#endif
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    batch_hidden->set_lod(batch_gate->lod());
+    to_seq(dev_ctx, *batch_hidden, hidden);
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    BatchCompute(context);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -218,9 +375,8 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(gru, ops::GRUOp, ops::GRUOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(gru_grad, ops::GRUGradOp);
-REGISTER_OP_CPU_KERNEL(
-    gru, ops::GRUKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GRUKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(gru, ops::GRUCPUKernel<float>,
+                       ops::GRUCPUKernel<double>);
 REGISTER_OP_CPU_KERNEL(
     gru_grad, ops::GRUGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GRUGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc
index baf455a840314d1ab94eb8e0a2e5c660ba4202da..55721c283dd18c2f9642563a9ce1eabfce16fd7b 100644
--- a/paddle/fluid/operators/gru_op.cu.cc
+++ b/paddle/fluid/operators/gru_op.cu.cc
@@ -14,6 +14,96 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/gru_op.h"
 
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class GRUKernel : public framework::OpKernel<T> {
+ public:
+  void BatchCompute(const framework::ExecutionContext& context) const {
+    auto* input = context.Input<LoDTensor>("Input");
+    auto* h0 = context.Input<Tensor>("H0");
+    auto* weight = context.Input<Tensor>("Weight");
+    const T* weight_data = weight->data<T>();
+    auto* bias = context.Input<Tensor>("Bias");
+    auto* batch_gate = context.Output<LoDTensor>("BatchGate");
+    batch_gate->mutable_data<T>(context.GetPlace());
+    auto* batch_reset_hidden_prev =
+        context.Output<LoDTensor>("BatchResetHiddenPrev");
+    batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
+    auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
+    batch_hidden->mutable_data<T>(context.GetPlace());
+    auto* hidden = context.Output<LoDTensor>("Hidden");
+    hidden->mutable_data<T>(context.GetPlace());
+
+    auto hidden_dims = hidden->dims();
+
+    bool is_reverse = context.Attr<bool>("is_reverse");
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    to_batch(dev_ctx, *input, batch_gate, true, is_reverse);
+
+    if (bias) {
+      math::RowwiseAdd<DeviceContext, T> add_bias;
+      add_bias(dev_ctx, *batch_gate, *bias, batch_gate);
+    }
+
+    int frame_size = hidden_dims[1];
+    math::GRUMetaValue<T> gru_value;
+    gru_value.gate_weight = const_cast<T*>(weight_data);
+    gru_value.state_weight =
+        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
+    Tensor ordered_h0;
+
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
+
+    if (h0) {
+      // Since the batch computing for GRU reorders the input sequences
+      // according to their length. The initialized cell state also needs
+      // to reorder.
+      ReorderInitState<DeviceContext, T>(
+          context.template device_context<DeviceContext>(), *h0, order,
+          &ordered_h0, true);
+      gru_value.prev_out_value = ordered_h0.data<T>();
+    } else {
+      gru_value.prev_out_value = nullptr;
+    }
+    auto batch_starts = batch_gate->lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    auto active_node = math::detail::GetActivationType(
+        context.Attr<std::string>("activation"));
+    auto active_gate = math::detail::GetActivationType(
+        context.Attr<std::string>("gate_activation"));
+    for (size_t n = 0; n < num_batch; n++) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+      int cur_batch_size = bend - bstart;
+
+      Tensor gate_t = batch_gate->Slice(bstart, bend);
+      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
+      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+      gru_value.output_value = hidden_t.data<T>();
+      gru_value.gate_value = gate_t.data<T>();
+      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
+      math::GRUUnitFunctor<DeviceContext, T>::compute(
+          dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
+          active_gate);
+      gru_value.prev_out_value = gru_value.output_value;
+    }
+
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    batch_hidden->set_lod(batch_gate->lod());
+    to_seq(dev_ctx, *batch_hidden, hidden);
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    BatchCompute(context);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     gru, ops::GRUKernel<paddle::platform::CUDADeviceContext, float>,
diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h
index 3b0d93e54b72910de1429ddf41eb6b0fe9646942..0b551e8046be16c95f7d6b10b68b32a9af594f73 100644
--- a/paddle/fluid/operators/gru_op.h
+++ b/paddle/fluid/operators/gru_op.h
@@ -37,90 +37,6 @@ inline void ReorderInitState(const DeviceContext& ctx,
   row_shuffle(ctx, src, index_lod, dst, indexed_src);
 }
 
-template <typename DeviceContext, typename T>
-class GRUKernel : public framework::OpKernel<T> {
- public:
-  void BatchCompute(const framework::ExecutionContext& context) const {
-    auto* input = context.Input<LoDTensor>("Input");
-    auto* h0 = context.Input<Tensor>("H0");
-    auto* weight = context.Input<Tensor>("Weight");
-    const T* weight_data = weight->data<T>();
-    auto* bias = context.Input<Tensor>("Bias");
-    auto* batch_gate = context.Output<LoDTensor>("BatchGate");
-    batch_gate->mutable_data<T>(context.GetPlace());
-    auto* batch_reset_hidden_prev =
-        context.Output<LoDTensor>("BatchResetHiddenPrev");
-    batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
-    auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
-    batch_hidden->mutable_data<T>(context.GetPlace());
-    auto* hidden = context.Output<LoDTensor>("Hidden");
-    hidden->mutable_data<T>(context.GetPlace());
-
-    auto hidden_dims = hidden->dims();
-
-    bool is_reverse = context.Attr<bool>("is_reverse");
-    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    to_batch(dev_ctx, *input, batch_gate, true, is_reverse);
-
-    if (bias) {
-      math::RowwiseAdd<DeviceContext, T> add_bias;
-      add_bias(dev_ctx, *batch_gate, *bias, batch_gate);
-    }
-
-    int frame_size = hidden_dims[1];
-    math::GRUMetaValue<T> gru_value;
-    gru_value.gate_weight = const_cast<T*>(weight_data);
-    gru_value.state_weight =
-        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
-    Tensor ordered_h0;
-
-    framework::Vector<size_t> order(batch_gate->lod()[2]);
-
-    if (h0) {
-      // Since the batch computing for GRU reorders the input sequences
-      // according to their length. The initialized cell state also needs
-      // to reorder.
-      ReorderInitState<DeviceContext, T>(
-          context.template device_context<DeviceContext>(), *h0, order,
-          &ordered_h0, true);
-      gru_value.prev_out_value = ordered_h0.data<T>();
-    } else {
-      gru_value.prev_out_value = nullptr;
-    }
-    auto batch_starts = batch_gate->lod()[0];
-    size_t num_batch = batch_starts.size() - 1;
-    auto active_node = math::detail::GetActivationType(
-        context.Attr<std::string>("activation"));
-    auto active_gate = math::detail::GetActivationType(
-        context.Attr<std::string>("gate_activation"));
-    for (size_t n = 0; n < num_batch; n++) {
-      int bstart = static_cast<int>(batch_starts[n]);
-      int bend = static_cast<int>(batch_starts[n + 1]);
-      int cur_batch_size = bend - bstart;
-
-      Tensor gate_t = batch_gate->Slice(bstart, bend);
-      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
-      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-      gru_value.output_value = hidden_t.data<T>();
-      gru_value.gate_value = gate_t.data<T>();
-      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
-      math::GRUUnitFunctor<DeviceContext, T>::compute(
-          dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
-          active_gate);
-      gru_value.prev_out_value = gru_value.output_value;
-    }
-
-    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    batch_hidden->set_lod(batch_gate->lod());
-    to_seq(dev_ctx, *batch_hidden, hidden);
-  }
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    BatchCompute(context);
-  }
-};
-
 template <typename DeviceContext, typename T>
 class GRUGradKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index 27e26cb1b5c1e831f05dac299489628b92eaa58c..51219504ffa2a778b56351f759e8a8dfb951ad91 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -92,6 +92,7 @@ class LoadOp : public framework::OperatorBase {
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(place);
     framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
+    selectedRows->SyncIndex();
   }
 };
 
diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc
index 2ce11e712fb1a8aa9748313ec7cf4e895a931465..de3f0990e109cacd49c4d888bbc1f797fb196e01 100644
--- a/paddle/fluid/operators/lookup_sparse_table_op.cc
+++ b/paddle/fluid/operators/lookup_sparse_table_op.cc
@@ -17,7 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -46,10 +45,6 @@ class LookupSparseTableOp : public framework::OperatorBase {
     auto out_var = scope.FindVar(Output("Out"));
     auto w_var = scope.FindVar(Input("W"));
     auto ids_var = scope.FindVar(Input("Ids"));
-    unsigned int seed = static_cast<unsigned int>(Attr<int>("seed"));
-    float min = Attr<float>("min");
-    float max = Attr<float>("max");
-    bool auto_grown_table = Attr<bool>("auto_grown_table");
 
     PADDLE_ENFORCE(out_var->IsType<framework::LoDTensor>(),
                    "The type of Out var should be LodTensor.");
@@ -60,46 +55,17 @@ class LookupSparseTableOp : public framework::OperatorBase {
     auto &ids_t = ids_var->Get<framework::LoDTensor>();
     auto out_t = out_var->GetMutable<framework::LoDTensor>();
     auto w_t = w_var->GetMutable<framework::SelectedRows>();
-    std::vector<int64_t> keys;
-    keys.resize(ids_t.numel());
-    for (int64_t i = 0; i < ids_t.numel(); ++i) {
-      keys[i] = ids_t.data<int64_t>()[i];
-    }
 
     // TODO(Yancey1989): support CUDA Place for the sparse table
     platform::CPUPlace cpu;
     auto out_shape = w_t->value().dims();
-    out_shape[0] = keys.size();
+    out_shape[0] = ids_t.numel();
     out_t->Resize(out_shape);
     out_t->mutable_data(cpu, w_t->value().type());
     PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()),
                       framework::proto::VarType::FP32,
                       "The sparse table only support FP32");
-    auto non_keys_pair = w_t->Get(keys, out_t);
-    if (!auto_grown_table) {
-      PADDLE_ENFORCE_EQ(non_keys_pair.size(), static_cast<size_t>(0),
-                        "there is some keys does exists in the sparse table.");
-    }
-    auto value_shape = w_t->value().dims();
-    value_shape[0] = 1;
-    for (const auto &it : non_keys_pair) {
-      const auto key = it.first;
-      const auto index = it.second;
-      framework::Tensor value;
-      value.Resize(value_shape);
-      auto data = value.mutable_data<float>(cpu);
-
-      std::minstd_rand engine;
-      engine.seed(seed);
-      std::uniform_real_distribution<float> dist(min, max);
-      int64_t size = value.numel();
-      for (int64_t i = 0; i < size; ++i) {
-        data[i] = dist(engine);
-      }
-      w_t->Set(key, value);
-      memory::Copy(cpu, out_t->mutable_data<float>(cpu) + index * value.numel(),
-                   cpu, value.data<float>(), value.numel() * sizeof(float));
-    }
+    w_t->Get(ids_t, out_t, true);
   }
 };
 
@@ -121,21 +87,6 @@ class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker {
                      "Otherwise the given value indicates padding the output "
                      "with zeros whenever lookup encounters it in Ids.")
         .SetDefault(kNoPadding);
-    AddAttr<float>("min",
-                   "(float, default -1.0) "
-                   "Minimum value of uniform random")
-        .SetDefault(-1.0f);
-    AddAttr<float>("max",
-                   "(float, default 1.0) "
-                   "Maximum value of uniform random")
-        .SetDefault(1.0f);
-    AddAttr<int>("seed",
-                 "(int, default 0) "
-                 "Random seed used for generating samples. "
-                 "0 means use a seed generated by the system."
-                 "Note that if seed is not 0, this operator will always "
-                 "generate the same random numbers every time.")
-        .SetDefault(0);
     AddAttr<bool>("auto_grown_table",
                   "(bool default false)"
                   "Whether create new value if for nonexistent key.")
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index 70f88f24f682e05972ca73ef7b50f96be50d1ef4..8dcf7c99f3860789dee834787eeb8b7ad4cc3530 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -90,6 +90,25 @@ class Blas {
   void GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T* A,
             int lda, const T* B, int ldb, T beta, T* C, int ldc) const;
 
+#ifdef PADDLE_WITH_MKLML
+  template <typename T>
+  T* GEMM_ALLOC(const CBLAS_IDENTIFIER id, const int M, const int N,
+                const int K) const;
+
+  template <typename T>
+  void GEMM_PACK(const CBLAS_IDENTIFIER id, const CBLAS_TRANSPOSE trans, int M,
+                 int N, int K, const T alpha, const T* src, const int ld,
+                 T* dst) const;
+
+  template <typename T>
+  void GEMM_COMPUTE(int transA, int transB, int M, int N, int K, const T* A,
+                    const int lda, const T* B, const int ldb, T beta, T* C,
+                    const int ldc) const;
+
+  template <typename T>
+  void GEMM_FREE(T* data) const;
+#endif
+
   template <typename T>
   void MatMul(const framework::Tensor& mat_a, bool trans_a,
               const framework::Tensor& mat_b, bool trans_b, T alpha,
@@ -115,6 +134,9 @@ class Blas {
   template <typename T>
   void VADD(int n, const T* x, const T* y, T* z) const;
 
+  template <typename T>
+  void VMUL(int n, const T* x, const T* y, T* z) const;
+
   template <typename T>
   void VCOPY(int n, const T* x, T* y) const;
 
@@ -146,6 +168,28 @@ class BlasT : private Blas<DeviceContext> {
     Base()->template GEMM<T>(args...);
   }
 
+#ifdef PADDLE_WITH_MKLML
+  template <typename... ARGS>
+  T* GEMM_ALLOC(ARGS... args) const {
+    return Base()->template GEMM_ALLOC<T>(args...);
+  }
+
+  template <typename... ARGS>
+  void GEMM_PACK(ARGS... args) const {
+    Base()->template GEMM_PACK<T>(args...);
+  }
+
+  template <typename... ARGS>
+  void GEMM_COMPUTE(ARGS... args) const {
+    Base()->template GEMM_COMPUTE<T>(args...);
+  }
+
+  template <typename... ARGS>
+  void GEMM_FREE(ARGS... args) const {
+    Base()->template GEMM_FREE<T>(args...);
+  }
+#endif
+
   template <typename... ARGS>
   void MatMul(ARGS... args) const {
     Base()->template MatMul<T>(args...);
@@ -161,6 +205,11 @@ class BlasT : private Blas<DeviceContext> {
     Base()->template VADD<T>(args...);
   }
 
+  template <typename... ARGS>
+  void VMUL(ARGS... args) const {
+    Base()->template VMUL<T>(args...);
+  }
+
   template <typename... ARGS>
   void VCOPY(ARGS... args) const {
     Base()->template VCOPY<T>(args...);
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index a0802ef90ca7e30a2b22d187cb9092163518d8e9..dc77b6d793702458a22a2f59b68e9d9f2c23b4ff 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -31,6 +31,26 @@ struct CBlas<float> {
     platform::dynload::cblas_sgemm(args...);
   }
 
+  template <typename... ARGS>
+  static float *GEMM_ALLOC(ARGS... args) {
+    return platform::dynload::cblas_sgemm_alloc(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMM_PACK(ARGS... args) {
+    platform::dynload::cblas_sgemm_pack(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMM_COMPUTE(ARGS... args) {
+    platform::dynload::cblas_sgemm_compute(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMM_FREE(ARGS... args) {
+    platform::dynload::cblas_sgemm_free(args...);
+  }
+
 #ifdef PADDLE_WITH_LIBXSMM
   template <typename... ARGS>
   static void SMM_GEMM(ARGS... args) {
@@ -62,6 +82,11 @@ struct CBlas<float> {
   static void VADD(ARGS... args) {
     platform::dynload::vsAdd(args...);
   }
+
+  template <typename... ARGS>
+  static void VMUL(ARGS... args) {
+    platform::dynload::vsMul(args...);
+  }
 };
 
 template <>
@@ -71,6 +96,26 @@ struct CBlas<double> {
     platform::dynload::cblas_dgemm(args...);
   }
 
+  template <typename... ARGS>
+  static double *GEMM_ALLOC(ARGS... args) {
+    return platform::dynload::cblas_dgemm_alloc(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMM_PACK(ARGS... args) {
+    platform::dynload::cblas_dgemm_pack(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMM_COMPUTE(ARGS... args) {
+    platform::dynload::cblas_dgemm_compute(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMM_FREE(ARGS... args) {
+    platform::dynload::cblas_dgemm_free(args...);
+  }
+
 #ifdef PADDLE_WITH_LIBXSMM
   template <typename... ARGS>
   static void SMM_GEMM(ARGS... args) {
@@ -102,6 +147,11 @@ struct CBlas<double> {
   static void VADD(ARGS... args) {
     platform::dynload::vdAdd(args...);
   }
+
+  template <typename... ARGS>
+  static void VMUL(ARGS... args) {
+    platform::dynload::vdMul(args...);
+  }
 };
 
 #else
@@ -159,6 +209,7 @@ struct CBlas<platform::float16> {
   static void SMM_GEMM(...) {
     PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
   }
+  static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
 #ifdef PADDLE_WITH_MKLML
   static void GEMM_BATCH(...) {
     PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
@@ -224,6 +275,41 @@ inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA,
                  beta, C, ldc);
 }
 
+#ifdef PADDLE_WITH_MKLML
+template <>
+template <typename T>
+T *Blas<platform::CPUDeviceContext>::GEMM_ALLOC(const CBLAS_IDENTIFIER id,
+                                                const int M, const int N,
+                                                const int K) const {
+  return CBlas<T>::GEMM_ALLOC(id, M, N, K);
+}
+
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::GEMM_PACK(const CBLAS_IDENTIFIER id,
+                                                 const CBLAS_TRANSPOSE trans,
+                                                 int M, int N, int K,
+                                                 const T alpha, const T *src,
+                                                 const int ld, T *dst) const {
+  CBlas<T>::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst);
+}
+
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::GEMM_COMPUTE(
+    int transA, int transB, int M, int N, int K, const T *A, const int lda,
+    const T *B, const int ldb, T beta, T *C, const int ldc) const {
+  CBlas<T>::GEMM_COMPUTE(CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb,
+                         beta, C, ldc);
+}
+
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::GEMM_FREE(T *data) const {
+  CBlas<T>::GEMM_FREE(data);
+}
+#endif
+
 template <>
 template <typename T>
 void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
@@ -299,6 +385,20 @@ void Blas<platform::CPUDeviceContext>::VADD(int n, const T *x, const T *y,
 #endif
 }
 
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::VMUL(int n, const T *x, const T *y,
+                                            T *z) const {
+#ifdef PADDLE_WITH_MKLML
+  CBlas<T>::VMUL(n, x, y, z);
+#else
+  // try to find if openblas support vmul
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] * y[i];
+  }
+#endif
+}
+
 template <>
 template <typename T>
 void Blas<platform::CPUDeviceContext>::GEMV(bool trans_a, int M, int N, T alpha,
diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu
index 58b85abf822741905a4e9547823b6cdbe645d39a..0de58d5fddd84d33f708c4c73e5a19dc2fe8a86b 100644
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@@ -15,25 +15,11 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
 namespace math {
 
-template <typename T>
-HOSTDEVICE T log(const T& val) {
-  return std::log(val);
-}
-
-template <>
-HOSTDEVICE platform::float16 log(const platform::float16& val) {
-  // strage bug, hlog is not exists.
-  return static_cast<float16>(0);
-  // half tmp = static_cast<half>(val);
-  // return static_cast<platform::float16>(hlog(tmp));
-}
-
 namespace {
 template <typename T>
 __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
@@ -49,12 +35,12 @@ template <typename T>
 __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
                                        const int class_num) {
   int tid = threadIdx.x;
-  T val(0);
+  T val = 0;
 
   int idx = blockIdx.x * class_num + tid;
   int end = blockIdx.x * class_num + class_num;
   for (; idx < end; idx += blockDim.x) {
-    val += math::TolerableValue<T>()(log(X[idx])) * label[idx];
+    val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx];
   }
 
   val = paddle::platform::reduceSum(val, tid, blockDim.x);
@@ -98,8 +84,6 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
 
 template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
 template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
-template class CrossEntropyFunctor<platform::CUDADeviceContext,
-                                   platform::float16>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h
index 2e4e4781c2eee1d9a0fc6760093a424ab3d5eb9d..adc5b3fe47cd3bf524eb56747b6bd51e345a2eb6 100644
--- a/paddle/fluid/operators/math/cross_entropy.h
+++ b/paddle/fluid/operators/math/cross_entropy.h
@@ -13,10 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include <limits>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
@@ -35,21 +33,6 @@ struct TolerableValue {
   }
 };
 
-// float16 value clip behave different.
-using paddle::platform::float16;
-using paddle::platform::isfinite;
-template <>
-struct TolerableValue<float16> {
-  HOSTDEVICE float16 operator()(const float16& x) const {
-    if (isfinite(x))
-      return x;
-    else if (x > static_cast<float16>(0))
-      return std::numeric_limits<float16>::max();
-    else
-      return std::numeric_limits<float16>::min();
-  }
-};
-
 template <typename DeviceContext, typename T>
 class CrossEntropyFunctor {
  public:
diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..8600fa9e2c4db9d54cbe0ffb68f82d52c086d4f7
--- /dev/null
+++ b/paddle/fluid/operators/math/fc_compute.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/operators/math/blas.h"
+
+DECLARE_int32(paddle_num_threads);
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename DeviceContext, typename T>
+inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
+                      const int N, const int K, const T* X, const T* W, T* Y,
+                      const T* B = NULL) {
+  blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, static_cast<T>(1), X, W,
+            static_cast<T>(0), Y);
+  if (B) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
+#endif
+    for (int i = 0; i < M; i++) {
+      blas.AXPY(N, static_cast<T>(1), B, Y + i * N);
+    }
+  }
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
index 00dbfc11a239da70ec81e3498d2f4d5e5bf1c63f..a92762c7fea865fad2c7784736cce93a8af21892 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
@@ -77,7 +76,6 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
 
 template struct SelectedRowsAdd<platform::CUDADeviceContext, float>;
 template struct SelectedRowsAdd<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
 
 namespace {
 template <typename T, int block_size>
@@ -122,7 +120,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
     auto* out_data = output->data<T>();
 
     SetConstant<platform::CUDADeviceContext, T> functor;
-    functor(context, output, static_cast<T>(0));
+    functor(context, output, 0.0);
 
     const int block_size = 256;
     dim3 threads(block_size, 1);
@@ -140,8 +138,6 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
 
 template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
 template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
-                                      platform::float16>;
 
 template <typename T>
 struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
@@ -181,8 +177,6 @@ template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>;
 template struct SelectedRowsAddTo<platform::CUDADeviceContext, double>;
 template struct SelectedRowsAddTo<platform::CUDADeviceContext, int>;
 template struct SelectedRowsAddTo<platform::CUDADeviceContext, int64_t>;
-template struct SelectedRowsAddTo<platform::CUDADeviceContext,
-                                  platform::float16>;
 
 namespace {
 template <typename T, int block_size>
@@ -235,8 +229,6 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
-                                        platform::float16>;
 
 namespace scatter {
 
@@ -284,7 +276,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
         context.GetPlace());
 
     math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
-    constant_functor(context, out.mutable_value(), static_cast<T>(0));
+    constant_functor(context, out.mutable_value(), 0.0);
 
     auto* out_data = out.mutable_value()->data<T>();
     auto* input_data = input.value().data<T>();
@@ -308,7 +300,6 @@ template struct MergeAdd<platform::CUDADeviceContext, float>;
 template struct MergeAdd<platform::CUDADeviceContext, double>;
 template struct MergeAdd<platform::CUDADeviceContext, int>;
 template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
-template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
 
 template <typename T, int block_size>
 __global__ void UpdateToTensorKernel(const T* selected_rows,
diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu
index 785c4baecbf056d08930f4bb704aec067a2db4a2..3effe776258cb541dbba32f63eda457d917011f4 100644
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -94,15 +94,12 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
 template class SoftmaxCUDNNFunctor<platform::float16>;
 template class SoftmaxCUDNNFunctor<float>;
 template class SoftmaxCUDNNFunctor<double>;
-template class SoftmaxGradCUDNNFunctor<platform::float16>;
 template class SoftmaxGradCUDNNFunctor<float>;
 template class SoftmaxGradCUDNNFunctor<double>;
 
 template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext,
-                                  platform::float16>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
 
diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
index 07aa23754f9786c56c0be14c2a71d5290d2cccf7..91e0ab28efc21d4376524c8ecf66b429d51d8847 100644
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -12,16 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#define EIGEN_USE_GPU
+
 #include "paddle/fluid/operators/mean_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MeanKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MeanKernel<paddle::platform::CUDADeviceContext, plat::float16>);
+    ops::MeanKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     mean_grad, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
+    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index a41d50ae0b99797800078184f7ffeb366367f493..362e9f9ae8b2f0f77198e3f3939211ae1117b27b 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -55,7 +55,7 @@ class MeanGradKernel : public framework::OpKernel<T> {
     IG->mutable_data<T>(context.GetPlace());
 
     T ig_size = static_cast<T>(IG->numel());
-    Eigen::DSizes<int, 1> bcast(static_cast<int>(ig_size));
+    Eigen::DSizes<int, 1> bcast(ig_size);
 
     EigenVector<T>::Flatten(*IG).device(
         *context.template device_context<DeviceContext>().eigen_device()) =
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index 51993398bd3427e1f0da155918395bc50fa65e45..2a8e4af516ce9341772d4668dc993215b4aae24d 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -54,9 +54,9 @@ class MulOp : public framework::OperatorWithKernel {
     auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
     auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
 
-    PADDLE_ENFORCE_EQ(
-        x_mat_dims[1], y_mat_dims[0],
-        "First matrix's width must be equal with second matrix's height.");
+    PADDLE_ENFORCE_EQ(x_mat_dims[1], y_mat_dims[0],
+                      "First matrix's width must be equal with second matrix's "
+                      "height. %s, %s");
     std::vector<int64_t> output_dims;
     output_dims.reserve(
         static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));
diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc
index 6c5a83c6a50c463502171f09bbf18e17e43917b5..81f3e42bf412fa4d2cb48405f2f8ee49b6aa0b67 100644
--- a/paddle/fluid/operators/mul_op.cu.cc
+++ b/paddle/fluid/operators/mul_op.cu.cc
@@ -20,7 +20,6 @@ namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
                         ops::MulKernel<plat::CUDADeviceContext, double>,
                         ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    mul_grad, ops::MulGradKernel<plat::CUDADeviceContext, float>,
-    ops::MulGradKernel<plat::CUDADeviceContext, double>,
-    ops::MulGradKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(mul_grad,
+                        ops::MulGradKernel<plat::CUDADeviceContext, float>,
+                        ops::MulGradKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt
index ce0ddd89bfb0d73e237a6f9a777376624d8ef2d4..cdcba8035762d8f442eb8b8ed52a4e3e99ac31b6 100644
--- a/paddle/fluid/operators/nccl/CMakeLists.txt
+++ b/paddle/fluid/operators/nccl/CMakeLists.txt
@@ -1,3 +1,3 @@
-if(WITH_GPU)
+if(WITH_GPU AND NOT WIN32)
   nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
 endif()
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
index 9fdbee818a217842e47c8ab11b84c6d5513ad219..31f083565fddee66aea1485ed71f41b6199f4502 100644
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -174,8 +174,7 @@ REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNGradOpKernel<float>,
-                   ops::PoolCUDNNGradOpKernel<double>,
-                   ops::PoolCUDNNGradOpKernel<plat::float16>);
+                   ops::PoolCUDNNGradOpKernel<double>);
 
 REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNOpKernel<float>,
@@ -183,5 +182,4 @@ REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNGradOpKernel<float>,
-                   ops::PoolCUDNNGradOpKernel<double>,
-                   ops::PoolCUDNNGradOpKernel<plat::float16>);
+                   ops::PoolCUDNNGradOpKernel<double>);
diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc
index 4a6ce938a5f337d035b21f562d46daf606236db0..a1f368e8690512cec2db7593aabc0279bbe174eb 100644
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@@ -57,6 +57,8 @@ class RecvOp : public framework::OperatorBase {
 class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() {
+    AddInput("X", "(Any) Dummy inputs, used for control dependency")
+        .AsDuplicable();
     AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable();
     AddComment(R"DOC(
 Recv operator
diff --git a/paddle/fluid/operators/sampling_id_op.cc b/paddle/fluid/operators/sampling_id_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..724463c95c4a29fb5c00fe791b389d3908771640
--- /dev/null
+++ b/paddle/fluid/operators/sampling_id_op.cc
@@ -0,0 +1,76 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/sampling_id_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+class SamplingIdOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SamplingIdOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SamplingIdOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
+        "min must less then max");
+
+    auto input_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE(input_dims.size() == 2,
+                   "Input(X, Filter) should be 2-D tensor.");
+
+    framework::DDim dims = input_dims;
+    ctx->SetOutputDim("Out", dims);
+    ctx->ShareLoD("X", "Out");
+  }
+};
+
+class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input tensor of softmax. "
+             "2-D with shape [batch_size, input_feature_dimensions].");
+    AddOutput("Out", "SamplingId data tensor.");
+    AddComment(R"DOC(
+SamplingId Operator.
+A layer for sampling id from multinomial distribution from the
+ input. Sampling one id for one sample.)DOC");
+    AddAttr<float>("min", "Minimum value of random. [default 0.0].")
+        .SetDefault(0.0f);
+    AddAttr<float>("max", "Maximun value of random. [default 1.0].")
+        .SetDefault(1.0f);
+    AddAttr<int>("seed",
+                 "Random seed used for the random number engine. "
+                 "0 means use a seed generated by the system."
+                 "Note that if seed is not 0, this operator will always "
+                 "generate the same random numbers every time. [default 0].")
+        .SetDefault(0);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(sampling_id, ops::SamplingIdOp, ops::SamplingIdOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OP_CPU_KERNEL(sampling_id, paddle::operators::SamplingIdKernel<float>,
+                       paddle::operators::SamplingIdKernel<double>);
diff --git a/paddle/fluid/operators/sampling_id_op.cu b/paddle/fluid/operators/sampling_id_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a4f0470314d00b5e370fd478736b54579c88448c
--- /dev/null
+++ b/paddle/fluid/operators/sampling_id_op.cu
@@ -0,0 +1,19 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include "paddle/fluid/operators/sampling_id_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(sampling_id, paddle::operators::SamplingIdKernel<float>,
+                        paddle::operators::SamplingIdKernel<double>);
diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f730a9746da56ca82090122193ec54efb774483e
--- /dev/null
+++ b/paddle/fluid/operators/sampling_id_op.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <random>
+#include <sstream>
+#include <vector>
+
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class SamplingIdKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("X");
+    const int batch_size = static_cast<int>(input->dims()[0]);
+    const int width = static_cast<int>(input->dims()[1]);
+
+    PADDLE_ENFORCE_GE(batch_size, 0,
+                      "batch_size(dims[0]) must be nonnegative.");
+    PADDLE_ENFORCE_GE(width, 0, "width(dims[1]) must be nonnegative.");
+
+    std::vector<T> ins_vector;
+    framework::TensorToVector(*input, context.device_context(), &ins_vector);
+
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    std::minstd_rand engine;
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    engine.seed(seed);
+    std::uniform_real_distribution<T> dist(
+        static_cast<T>(context.Attr<float>("min")),
+        static_cast<T>(context.Attr<float>("max")));
+
+    std::vector<T> ids(batch_size);
+    for (size_t i = 0; i < batch_size; ++i) {
+      T r = dist(engine);
+      int idx = width - 1;
+      for (int j = 0; j < width; ++j) {
+        if ((r -= ins_vector[i * width + j]) < 0) {
+          idx = j;
+          break;
+        }
+      }
+      ids[i] = ins_vector[i * width + idx];
+    }
+
+    std::vector<int64_t> out_dim;
+    out_dim.push_back(static_cast<int64_t>(batch_size));
+
+    Tensor* output = context.Output<Tensor>("Out");
+    output->Resize(framework::make_ddim(out_dim));
+    output->mutable_data<T>(context.GetPlace());
+    framework::TensorFromVector(ids, context.device_context(), output);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
index 201a51130d6b6f94104e2dabf9e7facffa672ae0..85de37416b5f24128ee98320a872eafffe967c81 100644
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -142,6 +142,8 @@ class SaveOp : public framework::OperatorBase {
     std::string filename = lt_var->data();
     VLOG(4) << "SaveSelectedRows get File name: " << filename;
 
+    MkDirRecursively(DirName(filename).c_str());
+
     auto &selectedRows = var->Get<framework::SelectedRows>();
 
     // get device context from pool
diff --git a/paddle/fluid/operators/scale_op.cu b/paddle/fluid/operators/scale_op.cu
index d266867046334f95eaaf4b7a9acb3fec20f1e439..04c802da12958a53626f533833c2709110531136 100644
--- a/paddle/fluid/operators/scale_op.cu
+++ b/paddle/fluid/operators/scale_op.cu
@@ -13,15 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/scale_op.h"
-#include "paddle/fluid/platform/float16.h"
 
-namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     scale,
     paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, float>,
     paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, double>,
     paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int>,
     paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   int64_t>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   plat::float16>);
+                                   int64_t>);
diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
index bf5e0d864495ce3a651a31c9d5a7664fe9eb2396..c32d2603cf76f55a9e723196977b0a70c92d597a 100644
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -81,8 +81,8 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "The source input of scatter op");
     AddInput("Ids", "The index input of scatter op where X will be updated");
-    AddInput("Updates", "The updated value of updates op");
-    AddOutput("Out", "The output of add op");
+    AddInput("Updates", "The updated value of scatter op");
+    AddOutput("Out", "The output of scatter op");
     AddComment(R"DOC(
 Scatter Operator.
 
@@ -90,7 +90,7 @@ This operator obtains output by updating the input on selected indices on the fi
 
 $$
 Out = X \\
-Out[Ids] = X[Ids] + Updates
+Out[Ids] = Updates
 $$
 
 )DOC");
diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h
index d29947b55e751a3e7993f765198364f4debe2472..2eefbba9726af4d38b40d91e9242faa2923dca20 100644
--- a/paddle/fluid/operators/scatter_op.h
+++ b/paddle/fluid/operators/scatter_op.h
@@ -34,9 +34,9 @@ class ScatterOpKernel : public framework::OpKernel<T> {
     auto *Updates = ctx.Input<Tensor>("Updates");
     auto *Out = ctx.Output<Tensor>("Out");
 
-    // In place output: Out = X, Out[Ids] += Updates
-    Out->ShareDataWith(*X);
-    // Apply ScatterUpdate: Out[index] += Updates[:]
+    // In place output: Out = X, Out[Ids] = Updates
+    framework::TensorCopySync(*X, ctx.GetPlace(), Out);
+    // Apply ScatterUpdate: Out[index] = Updates[:]
     ScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
   }
 };
@@ -53,9 +53,9 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
     auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
     // In place gradient: dX = dO
-    dX->ShareDataWith(*dOut);
+    framework::TensorCopySync(*dOut, ctx.GetPlace(), dX);
     dUpdates->mutable_data<T>(ctx.GetPlace());
-    // Gradient by Gather: dUpdates += dO[Ids]
+    // Gradient by Gather: dUpdates = dO[Ids]
     CPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
   }
 };
diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc
index 1866a86048acbefadcb4d82cd6309cd16f0352d6..14b07649c416ff1b671fc9b5ee4eb956b44570c5 100644
--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
@@ -37,22 +37,19 @@ class SendBarrierOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope& scope,
                const platform::Place& place) const override {
     std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
-    bool sync_mode = Attr<bool>("sync_mode");
 
     distributed::RPCClient* rpc_client =
         distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
-    VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode;
+    VLOG(3) << "SendBarrierOp sync";
 
     // need to wait before sending send_barrier message
     PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
-    if (sync_mode) {
-      for (auto& ep : eps) {
-        VLOG(3) << "send barrier, ep: " << ep;
-        rpc_client->AsyncSendBatchBarrier(ep);
-      }
-      PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
+    for (auto& ep : eps) {
+      VLOG(3) << "send barrier, ep: " << ep;
+      rpc_client->AsyncSendBatchBarrier(ep);
     }
+    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
   }
 };
 
@@ -70,7 +67,6 @@ the Parameter Server would knew all variables have been sent.
                                       "(string vector, default 127.0.0.1:6164)"
                                       "Server endpoints to send variables to.")
         .SetDefault({"127.0.0.1:6164"});
-    AddAttr<bool>("sync_mode", "work in sync_mode or not").SetDefault(true);
   }
 };
 
diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc
index 3cd42f2d059532b7090e66ce21de8e5cb014adf1..82a70e4bf13247d784371ffdf419c9f792d7f721 100644
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -66,6 +66,8 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() {
     AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
         .AsDuplicable();
+    AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
+        .AsDuplicable();
     AddComment(R"DOC(
 Send operator
 
diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h
index 2685ce217ee0f0d3e89f3751e96218dcd19bead4..d8b0165b2a89b04bd55671a37d96ee4ba275b2eb 100644
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
@@ -111,7 +111,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
       for (size_t i = 0; i < grad.rows().size(); i++) {
         PADDLE_ENFORCE(grad.rows()[i] < grad.height(),
                        "Input rows index should less than height");
-        int64_t id_index = param.Index(grad.rows()[i]);
+        int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false);
         PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0),
                           "id should be in the table");
         for (int64_t j = 0; j < grad_row_width; j++) {
diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
index c2d45c3d2ef82683352afe0e72f0330f7cd753f6..2bdb23e999621b10799b5163f326bc4b66a437e6 100644
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
@@ -78,5 +78,4 @@ REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
                    ops::SoftmaxCUDNNKernel<float>,
                    ops::SoftmaxCUDNNKernel<plat::float16>);
 REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
-                   ops::SoftmaxGradCUDNNKernel<float>,
-                   ops::SoftmaxGradCUDNNKernel<plat::float16>);
+                   ops::SoftmaxGradCUDNNKernel<float>);
diff --git a/paddle/fluid/operators/softmax_op.cu.cc b/paddle/fluid/operators/softmax_op.cu.cc
index 19359b7eef5126d84f0707d39095a74ae4561186..5fb4f011d9b47cebc4a23bcce47eada825263343 100644
--- a/paddle/fluid/operators/softmax_op.cu.cc
+++ b/paddle/fluid/operators/softmax_op.cu.cc
@@ -23,5 +23,4 @@ REGISTER_OP_CUDA_KERNEL(
     ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
     softmax_grad, ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>,
-    ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>,
-    ops::SoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>);
+    ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index 6c507baf3a0ab0a557d29a53700685753616193b..8a683116b8054de12fc4419b5aa5fbc019b675bb 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -23,9 +23,9 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SqueezeOp should not be null.");
+                   "Input(X) of Squeeze operator should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SqueezeOp should not be null.");
+                   "Output(Out) of Squeeze operator should not be null.");
 
     const auto &x_dims = ctx->GetInputDim("X");
     // Check input tensor dims (<6) Eigen limit.
@@ -107,7 +107,6 @@ class SqueezeOp : public framework::OperatorBase {
 
     framework::AttributeMap attrs;
     attrs["shape"] = framework::vectorize2int(out_dims);
-    attrs["inplace"] = Attr<bool>("inplace");
     // Invoke Reshape Op
     auto reshape_op = framework::OpRegistry::CreateOp(
         "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
@@ -125,12 +124,6 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
                               "(std::vector<int>). List of integers,"
                               " indicating the dimensions to squeeze.")
         .SetDefault({});
-    AddAttr<bool>("inplace",
-                  "(default: false) Squeeze the source tensor's shape without "
-                  "memory copy. When Attr(inplace) is set true, the output "
-                  "tensor shares memory with Input(X), otherwise, a new output "
-                  "tensor is created, and its data are copied from Input(x).")
-        .SetDefault(false);
     AddComment(R"DOC(
         Squeeze Operator.
         
@@ -180,7 +173,6 @@ class SqueezeGradOp : public framework::OperatorBase {
     auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
     framework::AttributeMap attrs;
     attrs["shape"] = framework::vectorize2int(x_dims);
-    attrs["inplace"] = Attr<bool>("inplace");
 
     auto reshape_op = framework::OpRegistry::CreateOp(
         "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc
index d2035777ee2289291a02594ee289156504df09d9..f9a16ef35ecb9eeb6c8eda9d124ecb17e7f9d5ce 100644
--- a/paddle/fluid/operators/sum_mkldnn_op.cc
+++ b/paddle/fluid/operators/sum_mkldnn_op.cc
@@ -34,15 +34,15 @@
 namespace paddle {
 namespace operators {
 
-using paddle::framework::Tensor;
-using paddle::platform::MKLDNNDeviceContext;
-using paddle::platform::CPUDeviceContext;
 using framework::DataLayout;
 using mkldnn::memory;
 using mkldnn::primitive;
+using mkldnn::reorder;
 using mkldnn::stream;
 using mkldnn::sum;
-using mkldnn::reorder;
+using paddle::framework::Tensor;
+using paddle::platform::CPUDeviceContext;
+using paddle::platform::MKLDNNDeviceContext;
 using platform::to_void_cast;
 
 template <typename T>
@@ -175,18 +175,35 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         auto& sel_row = get_selected_row(i);
         first_dim += sel_row.rows().size();
       }
-      auto in_dim =
-          framework::vectorize(get_selected_row(N - 1).value().dims());
+
+      std::vector<int64_t> in_dim;
+      for (int i = 0; i < N; i++) {
+        auto& sel_row = get_selected_row(i);
+        if (sel_row.rows().size() > 0) {
+          in_dim = framework::vectorize(sel_row.value().dims());
+          break;
+        }
+      }
+
+      if (in_dim.empty()) {
+        VLOG(3) << "WARNING: all the inputs are empty";
+        in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
+      } else {
+        in_dim[0] = static_cast<int64_t>(first_dim);
+      }
+
       in_dim[0] = static_cast<int64_t>(first_dim);
 
       out_value->Resize(framework::make_ddim(in_dim));
 
+      out_value->mutable_data<T>(ctx.GetPlace());
+
       // if all the input sparse vars are empty, no need to
       // merge these vars.
       if (first_dim == 0UL) {
         return;
       }
-      out_value->mutable_data<T>(ctx.GetPlace());
+
       math::SelectedRowsAddTo<CPUDeviceContext, T> functor;
       int64_t offset = 0;
       for (int i = 0; i < N; i++) {
diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu
index db4c2d6c115f04b436db00854ca4b02fea09866b..89bcd1bbc86dc29cb7b98cbef3057a8f98c74555 100644
--- a/paddle/fluid/operators/sum_op.cu
+++ b/paddle/fluid/operators/sum_op.cu
@@ -11,13 +11,10 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/sum_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     sum, ops::SumKernel<paddle::platform::CUDADeviceContext, float>,
     ops::SumKernel<paddle::platform::CUDADeviceContext, double>,
     ops::SumKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::SumKernel<paddle::platform::CUDADeviceContext, plat::float16>);
+    ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h
index dda6772796c821ffb813e73da0c34370e5339001..6dffe527c1072ee97fcde1725bfc1a47ed1ad74a 100644
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -46,7 +46,7 @@ class SumKernel : public framework::OpKernel<T> {
       if (!in_place) {
         math::SetConstant<DeviceContext, T> constant_functor;
         constant_functor(context.template device_context<DeviceContext>(), out,
-                         static_cast<T>(0));
+                         0.0);
       }
 
       math::SelectedRowsAddToTensor<DeviceContext, T> functor;
@@ -105,18 +105,30 @@ class SumKernel : public framework::OpKernel<T> {
         auto &sel_row = get_selected_row(i);
         first_dim += sel_row.rows().size();
       }
-      auto in_dim =
-          framework::vectorize(get_selected_row(N - 1).value().dims());
-      in_dim[0] = static_cast<int64_t>(first_dim);
+
+      std::vector<int64_t> in_dim;
+      for (int i = 0; i < N; i++) {
+        auto &sel_row = get_selected_row(i);
+        if (sel_row.rows().size() > 0) {
+          in_dim = framework::vectorize(sel_row.value().dims());
+          break;
+        }
+      }
+      if (in_dim.empty()) {
+        VLOG(3) << "WARNING: all the inputs are empty";
+        in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
+      } else {
+        in_dim[0] = static_cast<int64_t>(first_dim);
+      }
 
       out_value->Resize(framework::make_ddim(in_dim));
+      out_value->mutable_data<T>(context.GetPlace());
 
       // if all the input sparse vars are empty, no need to
       // merge these vars.
       if (first_dim == 0UL) {
         return;
       }
-      out_value->mutable_data<T>(context.GetPlace());
 
       math::SelectedRowsAddTo<DeviceContext, T> functor;
 
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
index ee3078876c15b06a887064f08dc0c05d450b5f77..1048d3017140c9e31426a1580b2862667116a024 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -17,112 +17,16 @@
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/engine.h"
-#include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/operators/tensorrt_engine_op.h"
 
 namespace paddle {
 
 DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
+DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size");
+DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size");
 
 namespace operators {
 
-using inference::Singleton;
-using inference::tensorrt::TRT_EngineManager;
-
-using FluidDT = framework::proto::VarType_Type;
-using TRT_DT = nvinfer1::DataType;
-
-namespace {
-
-TRT_DT FluidDataType2TRT(FluidDT type) {
-  switch (type) {
-    case FluidDT::VarType_Type_FP32:
-      return TRT_DT::kFLOAT;
-    case FluidDT::VarType_Type_INT32:
-      return TRT_DT::kINT32;
-    default:
-      return TRT_DT::kINT32;
-  }
-  PADDLE_THROW("unkown type");
-  return TRT_DT::kINT32;
-}
-
-nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
-  PADDLE_ENFORCE_GT(shape.size(), 1UL,
-                    "TensorRT' tensor input requires at least 2 dimensions");
-  PADDLE_ENFORCE_LE(shape.size(), 4UL,
-                    "TensorRT' tensor input requires at most 4 dimensions");
-  PADDLE_ENFORCE_EQ(shape.size(), 4UL);
-  return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
-}
-
-}  // namespace
-
-template <typename DeviceContext, typename T>
-void TensorRTEngineKernel<DeviceContext, T>::Prepare(
-    const framework::ExecutionContext &context) const {
-  VLOG(4) << "Prepare engine";
-  // Get the ProgramDesc and pass to convert.
-  framework::proto::BlockDesc block_desc;
-  block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
-  int max_batch = context.Attr<int>("max_batch");
-  auto max_workspace = context.Attr<int>("max_workspace");
-  auto params = context.Attr<std::vector<std::string>>("parameters");
-  std::unordered_set<std::string> parameters;
-  for (const auto &param : params) {
-    parameters.insert(param);
-  }
-
-  std::vector<std::string> output_maps =
-      context.Attr<std::vector<std::string>>("output_name_mapping");
-
-  // TODO(Superjomn) replace this with a different stream
-  auto *engine = Singleton<TRT_EngineManager>::Global().Create(
-      max_batch, max_workspace, nullptr /*engine hold its own stream*/,
-      context.Attr<std::string>("engine_uniq_key"));
-  engine->InitNetwork();
-
-  framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
-  VLOG(4) << "parsed var size " << block.AllVars().size();
-  // Add inputs
-  VLOG(4) << "declare inputs";
-  for (auto &input : context.Inputs("Xs")) {
-    if (parameters.count(input)) continue;
-    VLOG(4) << "declare input " << input;
-    auto *var = block.FindVar(input);
-    // TensorRT engine need to create parameters. The parameter's description
-    // should be set in
-    PADDLE_ENFORCE(var, "no variable called %s", input);
-    PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
-                      "TensorRT engine only takes LoDTensor as input");
-    auto shape = var->GetShape();
-    // For the special batch_size placeholder -1, drop it and pass the real
-    // shape of data.
-    // TODO(Superjomn) fix this with batch broadcast, or it can't handle
-    // variational batch size.
-    if (shape[0] == -1) {
-      shape[0] = FLAGS_tensorrt_engine_batch_size;
-    }
-    engine->DeclareInput(
-        input, FluidDataType2TRT(
-                   var->Proto()->type().lod_tensor().tensor().data_type()),
-        Vec2TRT_Dims(shape));
-  }
-
-  inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
-      block_desc, parameters, context.scope(), engine);
-
-  // Add outputs
-  for (auto &output : output_maps) {
-    engine->DeclareOutput(output);
-  }
-
-  engine->FreezeNetwork();
-}
-
 class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -130,8 +34,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Ys", "A list of outputs").AsDuplicable();
     AddAttr<std::string>("subgraph", "the subgraph.");
     AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
-    AddAttr<int>("max_batch", "the maximum batch size.");
-    AddAttr<int>("max_workspace", "the maximum batch size.");
     AddComment("TensorRT engine operator.");
   }
 };
@@ -150,11 +52,4 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
                   ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker);
 
-REGISTER_OP_CPU_KERNEL(
-    tensorrt_engine,
-    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int64_t>);
-
 #endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cu.cc b/paddle/fluid/operators/tensorrt_engine_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e1ddfde6d51ef719ca0b89cf286b176195ee682a
--- /dev/null
+++ b/paddle/fluid/operators/tensorrt_engine_op.cu.cc
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/tensorrt_engine_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    tensorrt_engine,
+    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
index 2cbe1213a2f428a3ce56b06f97636baeb4b66c26..bc556ab3643cefa3e45d2a8a3835937753af723f 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -19,16 +19,51 @@
 #include <string>
 #include <vector>
 
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 
 namespace paddle {
 
 DECLARE_int32(tensorrt_engine_batch_size);
+DECLARE_int32(tensorrt_max_batch_size);
+DECLARE_int32(tensorrt_workspace_size);
 
 namespace operators {
 
+using FluidDT = framework::proto::VarType_Type;
+using TRT_DT = nvinfer1::DataType;
+
+namespace {
+
+TRT_DT FluidDataType2TRT(FluidDT type) {
+  switch (type) {
+    case FluidDT::VarType_Type_FP32:
+      return TRT_DT::kFLOAT;
+    case FluidDT::VarType_Type_INT32:
+      return TRT_DT::kINT32;
+    default:
+      return TRT_DT::kINT32;
+  }
+  PADDLE_THROW("unkown type");
+  return TRT_DT::kINT32;
+}
+
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
+  PADDLE_ENFORCE_GT(shape.size(), 1UL,
+                    "TensorRT' tensor input requires at least 2 dimensions");
+  PADDLE_ENFORCE_LE(shape.size(), 4UL,
+                    "TensorRT' tensor input requires at most 4 dimensions");
+  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
+  if (shape.size() == 4UL)
+    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
+  return nvinfer1::DimsCHW(shape[1], 1, 1);
+}
+
+}  // namespace
+
 using inference::Singleton;
 using inference::tensorrt::TRT_EngineManager;
 
@@ -47,7 +82,7 @@ class TensorRTEngineOp : public framework::OperatorWithKernel {
                                   .FindVar(input0)
                                   ->GetMutable<framework::LoDTensor>()
                                   ->type()),
-        platform::CPUPlace());
+        ctx.GetPlace());
     return kt;
   }
 };
@@ -64,7 +99,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
     auto input_names = context.op().Inputs("Xs");
     PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
     PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
-                      context.Attr<int>("max_batch"));
+                      FLAGS_tensorrt_max_batch_size);
 
     std::vector<std::string> output_maps =
         context.Attr<std::vector<std::string>>("output_name_mapping");
@@ -94,12 +129,19 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
 
     // Convert output tensor from engine to fluid
     int output_index = 0;
+    VLOG(4) << "TensorRT Engine Op Outputs:";
     for (const auto& y : context.Outputs("Ys")) {
+      VLOG(4) << y;
       // convert output and copy to fluid.
       nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
       auto dims = trt_t->getDimensions();
       // Use the output ITensor's dims to reshape the Fluid Tensor.
-      std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
+      // The ITensor doesn't contain the batch size dim.
+      std::vector<int> ddim;
+      ddim.push_back(FLAGS_tensorrt_engine_batch_size);
+      for (int i = 0; i < dims.nbDims; i++) {
+        ddim.push_back(dims.d[i]);
+      }
 
       auto* fluid_v = context.scope().FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
@@ -113,9 +155,11 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       // TODO(Superjomn) change this float to dtype size.
       auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
                   FLAGS_tensorrt_engine_batch_size;
-      engine->GetOutputInCPU(output_maps[output_index],
-                             fluid_t->mutable_data<float>(platform::CPUPlace()),
-                             size * sizeof(float));
+      engine->GetOutputInGPU(
+          output_maps[output_index],
+          fluid_t->mutable_data<float>(platform::CUDAPlace(
+              boost::get<platform::CUDAPlace>(context.GetPlace()).device)),
+          size * sizeof(float));
       //} else {
       // engine->GetOutputInGPU(
       // y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
@@ -128,8 +172,67 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
   }
 
  protected:
-  // Build the engine.
-  void Prepare(const framework::ExecutionContext& context) const;
+  void Prepare(const framework::ExecutionContext& context) const {
+    VLOG(4) << "Prepare engine";
+    // Get the ProgramDesc and pass to convert.
+    framework::proto::BlockDesc block_desc;
+    block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
+    int max_batch = FLAGS_tensorrt_max_batch_size;
+    auto max_workspace = FLAGS_tensorrt_workspace_size;
+    auto params = context.Attr<std::vector<std::string>>("parameters");
+    std::unordered_set<std::string> parameters;
+    for (const auto& param : params) {
+      parameters.insert(param);
+    }
+
+    std::vector<std::string> output_maps =
+        context.Attr<std::vector<std::string>>("output_name_mapping");
+
+    // TODO(Superjomn) replace this with a different stream
+    auto* engine = Singleton<TRT_EngineManager>::Global().Create(
+        max_batch, max_workspace, nullptr /*engine hold its own stream*/,
+        context.Attr<std::string>("engine_uniq_key"),
+        boost::get<platform::CUDAPlace>(context.GetPlace()).device);
+
+    engine->InitNetwork();
+
+    framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
+    VLOG(4) << "parsed var size " << block.AllVars().size();
+    // Add inputs
+    VLOG(4) << "declare inputs";
+    for (auto& input : context.Inputs("Xs")) {
+      if (parameters.count(input)) continue;
+      VLOG(4) << "declare input " << input;
+      auto* var = block.FindVar(input);
+      // TensorRT engine need to create parameters. The parameter's description
+      // should be set in
+      PADDLE_ENFORCE(var, "no variable called %s", input);
+      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
+                        "TensorRT engine only takes LoDTensor as input");
+      auto shape = var->GetShape();
+      // For the special batch_size placeholder -1, drop it and pass the real
+      // shape of data.
+      // TODO(Superjomn) fix this with batch broadcast, or it can't handle
+      // variational batch size.
+      if (shape[0] == -1) {
+        shape[0] = FLAGS_tensorrt_engine_batch_size;
+      }
+      engine->DeclareInput(
+          input, FluidDataType2TRT(
+                     var->Proto()->type().lod_tensor().tensor().data_type()),
+          Vec2TRT_Dims(shape));
+    }
+
+    inference::Singleton<inference::tensorrt::OpConverter>::Global()
+        .ConvertBlock(block_desc, parameters, context.scope(), engine);
+
+    // Add outputs
+    for (auto& output : output_maps) {
+      engine->DeclareOutput(output);
+    }
+
+    engine->FreezeNetwork();
+  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc
index 37657fa0b0498986fe67027415279af1775e58b9..27c1d29762b3de5e57f877b271aae52e71eb7cf9 100644
--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/tensorrt_engine_op.h"
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -23,20 +24,20 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
-USE_CPU_ONLY_OP(tensorrt_engine);
+USE_CUDA_ONLY_OP(tensorrt_engine);
 
 namespace paddle {
 namespace operators {
 
 namespace {
-void CreateCPUTensor(framework::Scope* scope, const std::string& name,
-                     const std::vector<int64_t>& shape) {
+void CreateCUDATensor(framework::Scope* scope, const std::string& name,
+                      const std::vector<int64_t>& shape) {
   auto* var = scope->Var(name);
   auto* tensor = var->GetMutable<framework::LoDTensor>();
   auto dims = framework::make_ddim(shape);
   tensor->Resize(dims);
-  platform::CPUPlace place;
-  platform::CPUDeviceContext ctx(place);
+  platform::CUDAPlace place;
+  platform::CUDADeviceContext ctx(place);
   inference::tensorrt::RandomizeTensor(tensor, place, ctx);
 }
 
@@ -57,6 +58,8 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
 using inference::analysis::SetAttr;
 
 TEST(TensorRTEngineOp, manual) {
+  FLAGS_tensorrt_engine_batch_size = 2;
+  FLAGS_tensorrt_max_batch_size = 2;
   framework::ProgramDesc program;
   auto* block_ = program.Proto()->add_blocks();
   block_->set_idx(0);
@@ -98,8 +101,6 @@ TEST(TensorRTEngineOp, manual) {
   engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
-  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 100);
-  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 1 << 10);
   SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
   SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
                                     std::vector<std::string>({}));
@@ -112,15 +113,15 @@ TEST(TensorRTEngineOp, manual) {
   LOG(INFO) << "engine_op " << engine_op.get();
 
   framework::Scope scope;
-  platform::CPUPlace place;
-  platform::CPUDeviceContext ctx(place);
+  platform::CUDAPlace place;
+  platform::CUDADeviceContext ctx(place);
   // Prepare variables.
-  CreateCPUTensor(&scope, "x", std::vector<int64_t>({2, 4}));
-  CreateCPUTensor(&scope, "y", std::vector<int64_t>({4, 6}));
-  CreateCPUTensor(&scope, "z", std::vector<int64_t>({2, 6}));
+  CreateCUDATensor(&scope, "x", std::vector<int64_t>({2, 4}));
+  CreateCUDATensor(&scope, "y", std::vector<int64_t>({4, 6}));
+  CreateCUDATensor(&scope, "z", std::vector<int64_t>({2, 6}));
 
-  CreateCPUTensor(&scope, "y0", std::vector<int64_t>({6, 8}));
-  CreateCPUTensor(&scope, "z0", std::vector<int64_t>({2, 8}));
+  CreateCUDATensor(&scope, "y0", std::vector<int64_t>({6, 8}));
+  CreateCUDATensor(&scope, "z0", std::vector<int64_t>({2, 8}));
 
   // Execute them.
   LOG(INFO) << "engine_op run";
@@ -128,10 +129,12 @@ TEST(TensorRTEngineOp, manual) {
 }
 
 void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
+  FLAGS_tensorrt_engine_batch_size = batch_size;
+  FLAGS_tensorrt_max_batch_size = batch_size;
   framework::ProgramDesc program;
   framework::Scope scope;
-  platform::CPUPlace place;
-  platform::CPUDeviceContext ctx(place);
+  platform::CUDAPlace place;
+  platform::CUDADeviceContext ctx(place);
 
   auto* block_ = program.Proto()->add_blocks();
   block_->set_idx(0);
@@ -165,10 +168,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
 
     // Prepare variables.
     if (!x_created) {
-      CreateCPUTensor(&scope, x_name, std::vector<int64_t>(x_shape));
+      CreateCUDATensor(&scope, x_name, std::vector<int64_t>(x_shape));
     }
-    CreateCPUTensor(&scope, y_name, std::vector<int64_t>(y_shape));
-    CreateCPUTensor(&scope, z_name, std::vector<int64_t>(z_shape));
+    CreateCUDATensor(&scope, y_name, std::vector<int64_t>(y_shape));
+    CreateCUDATensor(&scope, z_name, std::vector<int64_t>(z_shape));
 
     // It is wired, need to copy manually.
     *block_->add_ops() = *fc->Proto();
diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
index 5fc0784f665f9f4a4422ca9b70f7dc6001833a8f..9da8551eb2d7ea66ad434c42b54522432095ce29 100644
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -11,19 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <limits>
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/top_k_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-using paddle::platform::float16;
 
 template <typename T>
 struct Pair {
@@ -35,11 +32,6 @@ struct Pair {
     id = id;
   }
 
-  __device__ __forceinline__ void clear() {
-    v = -INFINITY;
-    id = -1;
-  }
-
   __device__ __forceinline__ void operator=(const Pair<T>& in) {
     v = in.v;
     id = in.id;
@@ -61,12 +53,6 @@ struct Pair {
   int64_t id;
 };
 
-template <>
-__device__ __forceinline__ void Pair<float16>::clear() {
-  v = platform::raw_uint16_to_float16(0x400);
-  id = -1;
-}
-
 template <typename T>
 __device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
                                       int beam_size) {
@@ -164,7 +150,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
         if (k < MaxLength - (*beam)) {
           topk[k] = topk[k + *beam];
         } else {
-          topk[k].clear();
+          topk[k].set(-INFINITY, -1);
         }
       }
       if (!(*is_empty)) {
@@ -174,7 +160,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
     }
 
     *max = topk[MaxLength - 1];
-    if ((*max).v == static_cast<T>(-1)) *is_empty = true;
+    if ((*max).v == -1) *is_empty = true;
     *beam = 0;
   }
 }
@@ -195,7 +181,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
         if (k < MaxLength - *beam) {
           topk[k] = topk[k + *beam];
         } else {
-          topk[k].set(std::numeric_limits<T>::min(), -1);
+          topk[k].set(-INFINITY, -1);
         }
       }
       if (!(*is_empty)) {
@@ -287,7 +273,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
   bool firststep = true;
 
   for (int k = 0; k < MaxLength; k++) {
-    topk[k].clear();
+    topk[k].set(-INFINITY, -1);
   }
   while (k) {
     ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
@@ -339,7 +325,5 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_CUDA_KERNEL(
-    top_k, paddle::operators::TopkOpCUDAKernel<float>,
-    paddle::operators::TopkOpCUDAKernel<double>,
-    paddle::operators::TopkOpCUDAKernel<paddle::platform::float16>);
+REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>,
+                        paddle::operators::TopkOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index edd1baa4ace4e246190afcd12b0716f1dd38e243..5248767c2eeb9388c26d203e64f8b2c68ffe0865 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -30,8 +30,10 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
       tensor = out_var->GetMutable<framework::LoDTensor>();
     } else if (out_var->IsType<framework::SelectedRows>()) {
       auto shape = ctx.Attr<std::vector<int>>("shape");
-      tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
+      auto* selected_rows = out_var->GetMutable<framework::SelectedRows>();
+      tensor = selected_rows->mutable_value();
       tensor->Resize(framework::make_ddim(shape));
+      selected_rows->mutable_rows()->reserve(shape[0]);
     } else {
       PADDLE_THROW(
           "uniform_random_op's output only"
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index 2b8039a0c1bea07402435958608ea035ba862c90..e1c7323a30233f4ec4f60e46aa6088ee6d8601b7 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -11,14 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <glog/logging.h>
 #include <thrust/random.h>
 #include <thrust/transform.h>
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
 namespace operators {
@@ -40,11 +36,6 @@ struct UniformGenerator {
   }
 };
 
-template <typename T, typename V>
-struct CastFunctor {
-  HOSTDEVICE V operator()(const T& a) { return static_cast<V>(a); }
-};
-
 // It seems that Eigen::Tensor::random in GPU will SEGFAULT.
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
@@ -75,50 +66,18 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
     T max = static_cast<T>(context.Attr<float>("max"));
     thrust::counting_iterator<unsigned int> index_sequence_begin(0);
     int64_t size = tensor->numel();
-    if (out_var->IsType<framework::LoDTensor>() &&
-        std::type_index(typeid(T)) ==
-            std::type_index(typeid(platform::float16))) {
-      framework::Tensor master_copy_tensor;
-      master_copy_tensor.Resize(tensor->dims());
-      float* master_copy_tensor_data =
-          master_copy_tensor.mutable_data<float>(context.GetPlace());
-      thrust::transform(index_sequence_begin, index_sequence_begin + size,
-                        thrust::device_ptr<float>(master_copy_tensor_data),
-                        UniformGenerator<float>(static_cast<float>(min),
-                                                static_cast<float>(max), seed));
-      platform::Transform<platform::CUDADeviceContext> trans;
-      auto* in_begin = master_copy_tensor.data<float>();
-      auto* in_end = in_begin + master_copy_tensor.numel();
-      auto* out_begin = tensor->mutable_data<T>(context.GetPlace());
-      trans(context.template device_context<platform::CUDADeviceContext>(),
-            in_begin, in_end, out_begin, CastFunctor<float, T>());
-    } else {
-      thrust::transform(index_sequence_begin, index_sequence_begin + size,
-                        thrust::device_ptr<T>(data),
-                        UniformGenerator<T>(min, max, seed));
-    }
-    if (VLOG_IS_ON(5)) {
-      framework::Tensor cpu_tensor;
-      framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
-      auto& dev_ctx =
-          *platform::DeviceContextPool::Instance().Get(context.GetPlace());
-      dev_ctx.Wait();
-      auto x = framework::EigenVector<T>::Flatten(cpu_tensor);
-      VLOG(5) << "The Uniform output " << x;
-    }
+    thrust::transform(index_sequence_begin, index_sequence_begin + size,
+                      thrust::device_ptr<T>(data),
+                      UniformGenerator<T>(min, max, seed));
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
 
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    uniform_random, paddle::operators::GPUUniformRandomKernel<float>,
-    paddle::operators::GPUUniformRandomKernel<double>,
-    paddle::operators::GPUUniformRandomKernel<plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    uniform_random_batch_size_like,
-    paddle::operators::GPUUniformRandomKernel<float>,
-    paddle::operators::GPUUniformRandomKernel<double>,
-    paddle::operators::GPUUniformRandomKernel<plat::float16>);
+REGISTER_OP_CUDA_KERNEL(uniform_random,
+                        paddle::operators::GPUUniformRandomKernel<float>,
+                        paddle::operators::GPUUniformRandomKernel<double>);
+REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like,
+                        paddle::operators::GPUUniformRandomKernel<float>,
+                        paddle::operators::GPUUniformRandomKernel<double>);
diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
index f2a15fdf572e0de30f9949dda5020e130b0c5585..0fc8d54f6400c9dfb6af1e764ed44e95195bfe6e 100644
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -23,9 +23,9 @@ class UnsqueezeOpInferShape : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of UnsqueezeOp should not be null.");
+                   "Input(X) of Unsqueeze operator should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of UnsqueezeOp should not be null.");
+                   "Output(Out) of Unsqueeze operator should not be null.");
 
     const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
     const auto &x_dims = ctx->GetInputDim("X");
@@ -95,7 +95,6 @@ class UnsqueezeOp : public framework::OperatorBase {
 
     framework::AttributeMap attrs;
     attrs["shape"] = framework::vectorize2int(out_dims);
-    attrs["inplace"] = Attr<bool>("inplace");
     // Invoke Reshape op.
     auto reshape_op = framework::OpRegistry::CreateOp(
         "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
@@ -126,13 +125,6 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
                            " within [1, 6] dimensions (Eigen limit).");
           }
         });
-    AddAttr<bool>(
-        "inplace",
-        "(default: false) Unsqueeze the source tensor's shape without "
-        "memory copy. When Attr(inplace) is set true, the output "
-        "tensor shares memory with Input(X), otherwise, a new output "
-        "tensor is created, and its data are copied from Input(x).")
-        .SetDefault(false);
     AddComment(R"DOC(
     Unsqueeze Operator.
     
@@ -168,7 +160,6 @@ class UnsqueezeGradOp : public framework::OperatorBase {
 
     framework::AttributeMap attrs;
     attrs["shape"] = framework::vectorize2int(x_dims);
-    attrs["inplace"] = Attr<bool>("inplace");
 
     auto reshape_op = framework::OpRegistry::CreateOp(
         "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc
index 733157ea05ed39434b9a750e3a94ea548f512ce6..48e37796e1b4190e50602421106a105e4d4f6d74 100644
--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
@@ -57,12 +57,12 @@ class WhileOp : public framework::OperatorBase {
 
     PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
                    "Condition of while op must in CPU memory.");
+
+    auto ctx = executor.Prepare(*program, block->ID());
     while (cond.data<bool>()[0]) {
       auto &current_scope = scope.NewScope();
       step_scopes->push_back(&current_scope);
-
-      executor.Run(*program, &current_scope, block->ID(),
-                   false /*create_local_scope*/);
+      executor.RunPreparedContext(ctx.get(), &current_scope, false);
     }
   }
 };
@@ -109,6 +109,7 @@ class WhileGradOp : public framework::OperatorBase {
     framework::Executor executor(dev_place);
     auto *block = Attr<framework::BlockDesc *>(kStepBlock);
     auto *program = block->Program();
+    auto ctx = executor.Prepare(*program, block->ID());
 
     auto *step_scopes =
         scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>();
@@ -161,8 +162,7 @@ class WhileGradOp : public framework::OperatorBase {
           }
         }
       }
-
-      executor.Run(*program, *cur_scope_iter, block->ID(), false);
+      executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false);
 
       auto &pg_names = Outputs(kXGRAD);
       auto &p_names = Inputs(kX);
diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h
index 23457ff5fe1ec27094113ba0dde26adc64c716b5..9f504d14a8da116648483c0f64cb511b46e6a97e 100644
--- a/paddle/fluid/platform/cuda_device_function.h
+++ b/paddle/fluid/platform/cuda_device_function.h
@@ -36,7 +36,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
 #if CUDA_VERSION < 9000
   return __shfl_down(val, delta, width);
 #else
-  return __shfl_down_sync(mask, val, delta, width);
+  return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);
 #endif
 }
 
@@ -46,9 +46,16 @@ template <>
 __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
                                                        float16 val, int delta,
                                                        int width) {
-  half tmp = static_cast<half>(val);
-  __shfl_down(tmp, static_cast<unsigned>(delta), width);
-  return float16(tmp);
+  return float16(
+      __shfl_down(static_cast<half>(val), static_cast<unsigned>(delta), width));
+}
+#else
+template <>
+__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
+                                                       float16 val, int delta,
+                                                       int width) {
+  return float16(__shfl_down_sync(mask, static_cast<half>(val),
+                                  static_cast<unsigned>(delta), width));
 }
 #endif
 
diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu
index ca5ca1caeb23f01c047feeccf9c276b2dcd1cb68..ee45afab93d079374aefe366425502890854c28d 100644
--- a/paddle/fluid/platform/cuda_helper_test.cu
+++ b/paddle/fluid/platform/cuda_helper_test.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+#include <algorithm>
 #include <iostream>
 #include <random>
 
@@ -123,7 +124,7 @@ void TestUnalign(size_t num, const int shift_bit) {
   cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
   cudaDeviceSynchronize();
   for (size_t i = 0; i < num / 2; ++i) {
-    // NOTE(dzhwinter): the float16 add has small underflow/overflow
+    // NOTE(dzhwinter): the float16 add has small truncate error.
     // so we use EXPECT_NEAR to check the result.
     EXPECT_NEAR(static_cast<float>(out[i]),
                 static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])),
@@ -151,3 +152,83 @@ TEST(CudaAtomic, float16Unalign) {
   TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3);
   TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3);
 }
+
+// https://devblogs.nvidia.com/faster-parallel-reductions-kepler/
+template <typename T>
+static __forceinline__ __device__ T WarpReduceSum(T val) {
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, true);
+  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+    val += paddle::platform::CudaShuffleDownSync(mask, val, offset);
+  }
+  return val;
+}
+
+template <typename T>
+__forceinline__ __device__ T BlockReduce(T val) {
+  static __shared__ T shared[32];  // Shared mem for 32 partial sums
+  int lane = threadIdx.x % warpSize;
+  int wid = threadIdx.x / warpSize;
+
+  val = WarpReduceSum(val);  // Each warp performs partial reduction
+
+  if (lane == 0) shared[wid] = val;  // Write reduced value to shared memory
+
+  __syncthreads();  // Wait for all partial reductions
+
+  // read from shared memory only if that warp existed
+  val =
+      (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : static_cast<T>(0);
+
+  if (wid == 0) val = WarpReduceSum(val);  // Final reduce within first warp
+
+  return val;
+}
+
+template <typename T>
+__global__ void DeviceReduceSum(T* in, T* out, size_t N) {
+  T sum(0);
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    sum += in[i];
+  }
+  sum = BlockReduce<T>(sum);
+  __syncthreads();
+  if (threadIdx.x == 0) out[blockIdx.x] = sum;
+}
+
+template <typename T>
+void TestReduce(size_t num, float atol = 0.01) {
+  T* in1;
+  T *d_in1, *d_in2;
+  size_t size = sizeof(T) * num;
+  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
+  cudaMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));
+  in1 = reinterpret_cast<T*>(malloc(size));
+  std::minstd_rand engine;
+  std::uniform_real_distribution<double> dist(0.0, 1.0);
+  for (size_t i = 0; i < num; ++i) {
+    in1[i] = static_cast<T>(dist(engine));
+  }
+  auto out = std::accumulate(in1, in1 + num, static_cast<T>(0));
+  cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
+  cudaDeviceSynchronize();
+  DeviceReduceSum<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
+  cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  // NOTE(dzhwinter): the float16 add has small underflow/overflow
+  // so we use EXPECT_NEAR to check the result.
+  EXPECT_NEAR(static_cast<float>(in1[0]), static_cast<float>(out), atol);
+  free(in1);
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
+TEST(CudaShuffleSync, float16) {
+  TestReduce<float>(10);
+  TestReduce<float>(1000);
+
+  // float16 will overflow or accumulate truncate errors in big size.
+  TestReduce<float16>(10);
+  TestReduce<float16>(100, /*atol error*/ 1.0);
+}
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 9da787a4073fa002f75154f7c4fba54e9ed8efa6..07159d4a12ef4b628f7705ed206d3334be46dfc8 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -3,7 +3,7 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc)
 
 # There is no macOS version of NCCL.
-if (NOT APPLE)
+if (NOT APPLE AND NOT WIN32)
   list(APPEND CUDA_SRCS nccl.cc)
 endif()
 
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index 25bcda7eedc1ef42f75fb8fd1439f0c8f55015c3..c7c533bd42859c374c4783d43ec4cdd34a6a994a 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -17,10 +17,10 @@
 #include <cublasXt.h>
 #include <cublas_v2.h>
 #include <cuda.h>
-#include <dlfcn.h>
 #include <mutex>  // NOLINT
 #include <type_traits>
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 77e46fa768b62c277d7b4027de7173e39a5672b4..0103e7a3accf88f3c83f109298010c3c9af3d549 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <cudnn.h>
-#include <dlfcn.h>
 #include <mutex>  // NOLINT
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h
index e8f4a82ef132be9e4ec3fb76f11766046a2ff638..b946f46e82af4b09fafff54765b899254a4ec1df 100644
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -17,10 +17,10 @@ limitations under the License. */
 
 #include <cuda.h>
 #include <cupti.h>
-#include <dlfcn.h>
 #include <mutex>  // NOLINT
 
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h
index 5b9e0820e0b319fe7a636a57a0029caf038b4db3..2daf1b4215ce1f7f771bbac72bfe103b0b941976 100644
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
@@ -14,9 +14,9 @@ limitations under the License. */
 #pragma once
 
 #include <curand.h>
-#include <dlfcn.h>
 
 #include <mutex>  // NOLINT
+#include "paddle/fluid/platform/port.h"
 
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index 17acefe8cde01809572e4c86cbdccfed9a477a51..15ad4a3b40b1ad13a10dd37449c6f6f3e2029df6 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
-#include <dlfcn.h>
 #include <mkl.h>
 #include <mutex>  // NOLINT
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace platform {
@@ -49,17 +49,27 @@ extern void* mklml_dso_handle;
 
 #define MKLML_ROUTINE_EACH(__macro) \
   __macro(cblas_sgemm);             \
-  __macro(cblas_saxpy);             \
-  __macro(cblas_scopy);             \
-  __macro(cblas_sgemv);             \
-  __macro(cblas_sgemm_batch);       \
   __macro(cblas_dgemm);             \
+  __macro(cblas_saxpy);             \
   __macro(cblas_daxpy);             \
+  __macro(cblas_scopy);             \
   __macro(cblas_dcopy);             \
+  __macro(cblas_sgemv);             \
   __macro(cblas_dgemv);             \
+  __macro(cblas_sgemm_alloc);       \
+  __macro(cblas_dgemm_alloc);       \
+  __macro(cblas_sgemm_pack);        \
+  __macro(cblas_dgemm_pack);        \
+  __macro(cblas_sgemm_compute);     \
+  __macro(cblas_dgemm_compute);     \
+  __macro(cblas_sgemm_free);        \
+  __macro(cblas_dgemm_free);        \
+  __macro(cblas_sgemm_batch);       \
   __macro(cblas_dgemm_batch);       \
   __macro(vsAdd);                   \
   __macro(vdAdd);                   \
+  __macro(vsMul);                   \
+  __macro(vdMul);                   \
   __macro(MKL_Set_Num_Threads)
 
 MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h
index 575516f81870fc9f7b92919ffc20a201cb5cbce8..331ca9908e126d5dbca830457281fbf88fc1df09 100644
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
@@ -13,12 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
-#include <dlfcn.h>
 #include <nccl.h>
 
 #include <mutex>  // NOLINT
-
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h
index d157c1fda789b98f06ad069d2a9c4f421ff82dcd..18ed9956f1841874b27c2493e2f3e22fdfbf0448 100644
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -14,10 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include <dlfcn.h>
 #include <mutex>  // NOLINT
-
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"
 #include "warpctc/include/ctc.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 566485cd3c383640047d97f40b452735e8c8c171..a76ba75f9eeb8c3f42fbf7254f629b0960a8f2d8 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -14,9 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include <dlfcn.h>     // for dladdr
-#include <execinfo.h>  // for backtrace
-
 #ifdef __GNUC__
 #include <cxxabi.h>  // for __cxa_demangle
 #endif               // __GNUC__
@@ -37,6 +34,7 @@ limitations under the License. */
 
 #include "glog/logging.h"
 #include "paddle/fluid/platform/macros.h"
+#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/string/to_string.h"
 
@@ -44,7 +42,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/dynload/curand.h"
-#ifndef __APPLE__
+#if !defined(__APPLE__) and !defined(_WIN32)
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif  // __APPLE__
 #endif  // PADDLE_WITH_CUDA
@@ -75,7 +73,7 @@ struct EnforceNotMet : public std::exception {
 
       sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl;
       sout << "PaddlePaddle Call Stacks: " << std::endl;
-
+#if !defined(_WIN32)
       void* call_stack[TRACE_STACK_LIMIT];
       auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
       auto symbols = backtrace_symbols(call_stack, size);
@@ -95,6 +93,9 @@ struct EnforceNotMet : public std::exception {
         }
       }
       free(symbols);
+#else
+      sout << "Windows not support stack backtrace yet.";
+#endif
       err_str_ = sout.str();
     }
   }
@@ -205,7 +206,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
 #endif
 }
 
-#ifndef __APPLE__
+#if !defined(__APPLE__) and !defined(_WIN32)
 template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
     ncclResult_t stat, const Args&... args) {
@@ -221,7 +222,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
 #endif
   }
 }
-#endif  // __APPLE__
+#endif  // __APPLE__ and windows
 #endif  // PADDLE_WITH_CUDA
 
 template <typename T>
@@ -263,7 +264,8 @@ inline void throw_on_error(T e) {
  *    PADDLE_ENFORCE_EQ(a, b);
  *
  *    will raise an expression described as follows:
- *    "enforce a == b failed, 1 != 2" with detailed stack information.
+ *    "Enforce failed. Expected input a == b, but received a(1) != b(2)."
+ *      with detailed stack information.
  *
  *    extra messages is also supported, for example:
  *    PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
@@ -292,9 +294,10 @@ inline void throw_on_error(T e) {
 #define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...)  \
   do {                                                                  \
     if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) {                           \
-      PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP    \
-                   " %s\n%s",                                           \
-                   #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \
+      PADDLE_THROW("Enforce failed. Expected %s " #__CMP                \
+                   " %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \
+                   #__VAL0, #__VAL1, #__VAL0,                           \
+                   paddle::string::to_string(__VAL0), #__VAL1,          \
                    paddle::string::to_string(__VAL1),                   \
                    paddle::string::Sprintf("" __VA_ARGS__));            \
     }                                                                   \
diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc
index 0e8684581a93f076b1a077cc52e966d3c88cf078..d52182965552e9ec945cb7d0b421d8addcb758e9 100644
--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
@@ -54,7 +54,9 @@ TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) {
     PADDLE_ENFORCE_EQ(a, 1 + 3);
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
-    HasPrefix(StringPiece(error.what()), "enforce a == 1 + 3 failed, 2 != 4");
+    HasPrefix(
+        StringPiece(error.what()),
+        "Enforce failed. Expected a == 1 + 3, but received a:2 != 1 + 3:4.");
   }
   EXPECT_TRUE(caught_exception);
 }
@@ -67,7 +69,8 @@ TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) {
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
     HasPrefix(StringPiece(error.what()),
-              "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match");
+              "Enforce failed. Expected a == 1 + 3, but received a:2 != 1 + "
+              "3:4.\ntheir size not match");
   }
   EXPECT_TRUE(caught_exception);
 }
@@ -84,8 +87,9 @@ TEST(ENFORCE_NE, FAIL) {
     PADDLE_ENFORCE_NE(1.0, 1UL);
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
-    EXPECT_TRUE(HasPrefix(StringPiece(error.what()),
-                          "enforce 1.0 != 1UL failed, 1 == 1"))
+    EXPECT_TRUE(HasPrefix(
+        StringPiece(error.what()),
+        "Enforce failed. Expected 1.0 != 1UL, but received 1.0:1 == 1UL:1."))
         << error.what() << " does not have expected prefix";
   }
   EXPECT_TRUE(caught_exception);
@@ -98,8 +102,9 @@ TEST(ENFORCE_GT, FAIL) {
     PADDLE_ENFORCE_GT(1, 2UL);
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
-    EXPECT_TRUE(
-        HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2"));
+    EXPECT_TRUE(HasPrefix(
+        StringPiece(error.what()),
+        "Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2."));
   }
   EXPECT_TRUE(caught_exception);
 }
@@ -116,8 +121,9 @@ TEST(ENFORCE_GE, FAIL) {
     PADDLE_ENFORCE_GE(1, 2UL);
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
-    EXPECT_TRUE(
-        HasPrefix(StringPiece(error.what()), "enforce 1 >= 2UL failed, 1 < 2"));
+    EXPECT_TRUE(HasPrefix(
+        StringPiece(error.what()),
+        "Enforce failed. Expected 1 >= 2UL, but received 1:1 < 2UL:2."));
   }
   EXPECT_TRUE(caught_exception);
 }
@@ -135,8 +141,9 @@ TEST(ENFORCE_LE, FAIL) {
     PADDLE_ENFORCE_GT(1, 2UL);
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
-    EXPECT_TRUE(
-        HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2"));
+    EXPECT_TRUE(HasPrefix(
+        StringPiece(error.what()),
+        "Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2."));
   }
   EXPECT_TRUE(caught_exception);
 }
@@ -153,7 +160,8 @@ TEST(ENFORCE_LT, FAIL) {
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
     EXPECT_TRUE(HasPrefix(StringPiece(error.what()),
-                          "enforce 1UL < 0.12 failed, 1 >= 0.12"));
+                          "Enforce failed. Expected 1UL < 0.12, but "
+                          "received 1UL:1 >= 0.12:0.12."));
   }
   EXPECT_TRUE(caught_exception);
 }
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 4cee93f3a4224cb97327254cd1679021d197a1b1..126636d879213b1c8f242db8fbdf6a358a1d2da9 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -116,7 +116,8 @@ size_t GpuMaxChunkSize() {
   size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
                                           (total - reserving));
 
-  PADDLE_ENFORCE_LE(allocating, available);
+  PADDLE_ENFORCE_LE(allocating, available,
+                    "Insufficient GPU memory to allocation.");
 
   return allocating;
 }
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 10a3ad256b17ba41380cdc0377905d03188cbaa3..f6e9a52b275353c03c1f350719766922a97f6cb3 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -125,6 +125,11 @@ class MKLDNNHandler {
     return this->AcquireMemory(md, ptr, "@user_weights_mem_p");
   }
 
+  std::shared_ptr<mkldnn::memory> AcquireBiasMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_bias_mem_p");
+  }
+
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(
       const mkldnn::memory::desc& md, void* ptr) {
     return this->AcquireMemory(md, ptr, "@user_dst_mem_p");
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
new file mode 100644
index 0000000000000000000000000000000000000000..a0a2d29500e7afbe8a9a43f010d5fd2d0c560467
--- /dev/null
+++ b/paddle/fluid/platform/port.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+
+#if !defined(_WIN32)
+#include <dlfcn.h>     // for dladdr
+#include <execinfo.h>  // for backtrace
+#else
+#include <Shlwapi.h>
+#include <Windows.h>
+
+static void* dlsym(void* handle, const char* symbol_name) {
+  FARPROC found_symbol;
+  found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
+
+  if (found_symbol == NULL) {
+    throw std::runtime_error(std::string(symbol_name) + " not found.");
+  }
+  return reinterpret_cast<void*>(found_symbol);
+}
+
+#endif
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 89ca4f781273e99bbb83216c238dfc5c88c0a22b..d6a14b3305c5cf2d544f17f39a3812f7f75b8a76 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,19 +1,22 @@
+set(PYBIND_DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method
+          )
+if(NOT WIN32)
+list(APPEND PYBIND_DEPS parallel_executor)
+endif()
 if(WITH_PYTHON)
   if(WITH_AMD_GPU)
     hip_library(paddle_pybind SHARED
       SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
-      DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method
-           parallel_executor
+      DEPS ${PYBIND_DEPS}
       ${GLOB_OP_LIB})
   else()
     cc_library(paddle_pybind SHARED
       SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
-      DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method
-           parallel_executor
+      DEPS ${PYBIND_DEPS}
       ${GLOB_OP_LIB})
-    if(NOT APPLE AND NOT ANDROID)
+    if(NOT APPLE AND NOT ANDROID AND NOT WIN32)
       target_link_libraries(paddle_pybind rt)
-    endif(NOT APPLE AND NOT ANDROID)
+    endif(NOT APPLE AND NOT ANDROID AND NOT WIN32)
   endif(WITH_AMD_GPU)
 
   cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python)
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index be623703c2480774bb04a6bc0c5b00b699d7bb16..c2137ec6d7df24251432a4dfb8fffc3d3f77194e 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -205,12 +205,7 @@ void BindBlockDesc(pybind11::module *m) {
 void BindVarDsec(pybind11::module *m) {
   pybind11::class_<pd::VarDesc> var_desc(*m, "VarDesc", "");
   var_desc
-      .def("name",
-           [](pd::VarDesc &self) {
-             pybind11::bytes name = self.Name();
-             return name;
-           },
-           pybind11::return_value_policy::reference)
+      .def("name", &pd::VarDesc::Name, pybind11::return_value_policy::reference)
       .def("set_name", &pd::VarDesc::SetName)
       .def("set_shape", &pd::VarDesc::SetShape)
       .def("set_shapes", &pd::VarDesc::SetShapes)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 7127bb38f6ddf8a55c1741d1f0ef18c8d9067fba..67734659233515ca8110f4212a2b1553fe4e9d24 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -54,6 +54,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
 
+#include "pybind11/stl.h"
+
 // disable auto conversion to list in Python
 PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
 
@@ -247,6 +249,7 @@ PYBIND11_PLUGIN(core) {
         self.set_rows(new_rows);
 #endif
            })
+      .def("sync_index", [](SelectedRows &instance) { instance.SyncIndex(); })
       .def("rows", [](SelectedRows &self) {
         auto rows = self.rows();
         std::vector<int64_t> new_rows;
@@ -593,8 +596,8 @@ All parameter, weight, gradient are variables in Paddle.
 
   // -- python binds for parallel executor.
   py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
-  py::class_<ExecutionStrategy>(pe, "ExecutionStrategy")
-      .def(py::init())
+  py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy");
+  exec_strategy.def(py::init())
       .def_property(
           "num_threads",
           [](const ExecutionStrategy &self) { return self.num_threads_; },
@@ -621,6 +624,16 @@ All parameter, weight, gradient are variables in Paddle.
           [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
             self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
           });
+  exec_strategy.def_property(
+      "use_experimental_executor",
+      [](const ExecutionStrategy &self) {
+        return self.type_ == ExecutionStrategy::kExperimental;
+      },
+      [](ExecutionStrategy &self, bool experimental) {
+        self.type_ = experimental ? ExecutionStrategy::kExperimental
+                                  : ExecutionStrategy::kDefault;
+      });
+
   py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy");
 
   py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 1283de9d957a46b848c7bb6caf9c5f49398468e2..622a2d51049d164b6e8423e4054081f40f190cb9 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -54,7 +54,7 @@ function cpu_config() {
   if [ $platform == "Linux" ]; then
     ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
   elif [ $platform == "Darwin" ]; then
-    if [`sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu`]; then
+    if [ `sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu` ]; then
       # HT is OFF
       ht=1
     fi
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 25900811509aee8b37fdaf09cf902ea2ae3eee57..9cdcb87df5dd1669066c204c86c269973df506f1 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -97,10 +97,11 @@ if(APPLE)
   if(NOT INSTALL_NAME_TOOL_EXECUTABLE)
     message(FATAL_ERROR "install_name_tool not found, please check.\n")
   endif()
-else(APPLE)
+endif()
+if(LINUX)
   find_program(PATCHELF_EXECUTABLE patchelf)
   if(NOT PATCHELF_EXECUTABLE)
     message(FATAL_ERROR "patchelf not found, please install it.\n"
             "For Ubuntu, the command is: apt-get install -y patchelf.")
   endif()
-endif(APPLE)
+endif(LINUX)
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 241a07a35297e85763781a42696fd727733459a3..53746afdb25b34b69f89fe0927c877ace62d7d55 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -24,4 +24,5 @@ except ImportError:
 import paddle.reader
 import paddle.dataset
 import paddle.batch
+import paddle.compat
 batch = batch.batch
diff --git a/python/paddle/compat.py b/python/paddle/compat.py
new file mode 100644
index 0000000000000000000000000000000000000000..50726b6fa1bbbde68a590c86db9344b8f02f79f2
--- /dev/null
+++ b/python/paddle/compat.py
@@ -0,0 +1,237 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import six
+import math
+
+__all__ = [
+    'long_type',
+    'to_text',
+    'to_bytes',
+    'round',
+    'floor_division',
+    'get_exception_message',
+]
+
+if six.PY2:
+    int_type = int
+    long_type = long
+else:
+    int_type = int
+    long_type = int
+
+
+#  str and bytes related functions
+def to_text(obj, encoding='utf-8', inplace=False):
+    """
+      All string in PaddlePaddle should be represented as a literal string.
+    This function will convert object to a literal string without any encoding.
+    Especially, if the object type is a list or set container, we will iterate
+    all items in the object and convert them to literal string.
+
+    In Python3:
+        Decode the bytes type object to str type with specific encoding
+
+    In Python2:
+        Decode the str type object to unicode type with specific encoding
+
+    Args:
+        obj(unicode|str|bytes|list|set) : The object to be decoded.
+        encoding(str) : The encoding format to decode a string
+        inplace(bool) : If we change the original object or we create a new one
+
+    Returns:
+        Decoded result of obj
+    """
+    if obj is None:
+        return obj
+
+    if isinstance(obj, list):
+        if inplace:
+            for i in six.moves.xrange(len(obj)):
+                obj[i] = _to_text(obj[i], encoding)
+            return obj
+        else:
+            return [_to_text(item, encoding) for item in obj]
+    elif isinstance(obj, set):
+        if inplace:
+            for item in obj:
+                obj.remove(item)
+                obj.add(_to_text(item, encoding))
+            return obj
+        else:
+            return set([_to_text(item, encoding) for item in obj])
+    else:
+        return _to_text(obj, encoding)
+
+
+def _to_text(obj, encoding):
+    """
+    In Python3:
+        Decode the bytes type object to str type with specific encoding
+
+    In Python2:
+        Decode the str type object to unicode type with specific encoding,
+        or we just return the unicode string of object
+
+    Args:
+        obj(unicode|str|bytes) : The object to be decoded.
+        encoding(str) : The encoding format
+
+    Returns:
+        decoded result of obj
+    """
+    if obj is None:
+        return obj
+
+    if isinstance(obj, six.binary_type):
+        return obj.decode(encoding)
+    elif isinstance(obj, six.text_type):
+        return obj
+    else:
+        return six.u(obj)
+
+
+def to_bytes(obj, encoding='utf-8', inplace=False):
+    """
+      All string in PaddlePaddle should be represented as a literal string.
+    This function will convert object to a bytes with specific encoding.
+    Especially, if the object type is a list or set container, we will iterate
+    all items in the object and convert them to bytes.
+
+    In Python3:
+        Encode the str type object to bytes type with specific encoding
+
+    In Python2:
+        Encode the unicode type object to str type with specific encoding,
+        or we just return the 8-bit string of object
+
+    Args:
+        obj(unicode|str|bytes|list|set) : The object to be encoded.
+        encoding(str) : The encoding format to encode a string
+        inplace(bool) : If we change the original object or we create a new one
+
+    Returns:
+        Decoded result of obj
+    """
+    if obj is None:
+        return obj
+
+    if isinstance(obj, list):
+        if inplace:
+            for i in six.moves.xrange(len(obj)):
+                obj[i] = _to_bytes(obj[i], encoding)
+            return obj
+        else:
+            return [_to_bytes(item, encoding) for item in obj]
+    elif isinstance(obj, set):
+        if inplace:
+            for item in obj:
+                obj.remove(item)
+                obj.add(_to_bytes(item, encoding))
+            return obj
+        else:
+            return set([_to_bytes(item, encoding) for item in obj])
+    else:
+        return _to_bytes(obj, encoding)
+
+
+def _to_bytes(obj, encoding):
+    """
+    In Python3:
+        Encode the str type object to bytes type with specific encoding
+
+    In Python2:
+        Encode the unicode type object to str type with specific encoding,
+        or we just return the 8-bit string of object
+
+    Args:
+        obj(unicode|str|bytes) : The object to be encoded.
+        encoding(str) : The encoding format
+
+    Returns:
+        encoded result of obj
+    """
+    if obj is None:
+        return obj
+
+    assert encoding is not None
+    if isinstance(obj, six.text_type):
+        return obj.encode(encoding)
+    elif isinstance(obj, six.binary_type):
+        return obj
+    else:
+        return six.b(obj)
+
+
+# math related functions
+def round(x, d=0):
+    """
+    Compatible round which act the same behaviour in Python3.
+
+    Args:
+        x(float) : The number to round halfway.
+
+    Returns:
+        round result of x
+    """
+    if six.PY3:
+        # The official walkaround of round in Python3 is incorrect
+        # we implement accroding this answer: https://www.techforgeek.info/round_python.html
+        if x > 0.0:
+            p = 10**d
+            return float(math.floor((x * p) + math.copysign(0.5, x))) / p
+        elif x < 0.0:
+            p = 10**d
+            return float(math.ceil((x * p) + math.copysign(0.5, x))) / p
+        else:
+            return math.copysign(0.0, x)
+    else:
+        import __builtin__
+        return __builtin__.round(x, d)
+
+
+def floor_division(x, y):
+    """
+    Compatible division which act the same behaviour in Python3 and Python2,
+    whose result will be a int value of floor(x / y) in Python3 and value of
+    (x / y) in Python2.
+
+    Args:
+        x(int|float) : The number to divide.
+        y(int|float) : The number to be divided
+
+    Returns:
+        division result of x // y
+    """
+    return x // y
+
+
+# exception related functions
+def get_exception_message(exc):
+    """
+    Get the error message of a specific exception
+
+    Args:
+        exec(Exception) : The exception to get error message.
+
+    Returns:
+        the error message of exec
+    """
+    assert exc is not None
+
+    if six.PY2:
+        return exc.message
+    else:
+        return str(exc)
diff --git a/python/paddle/dataset/cifar.py b/python/paddle/dataset/cifar.py
index f6b4ff8fbd0f83b1d652d37c1b2d04efd3c73cbb..b83fa78c4c65357407b7f884f8c3fe8ef0ccaba8 100644
--- a/python/paddle/dataset/cifar.py
+++ b/python/paddle/dataset/cifar.py
@@ -28,11 +28,13 @@ images per class.
 
 """
 
+from __future__ import print_function
+
 import itertools
 import numpy
 import paddle.dataset.common
 import tarfile
-from six.moves import zip
+import six
 from six.moves import cPickle as pickle
 
 __all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
@@ -46,10 +48,11 @@ CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
 
 def reader_creator(filename, sub_name, cycle=False):
     def read_batch(batch):
-        data = batch['data']
-        labels = batch.get('labels', batch.get('fine_labels', None))
+        data = batch[six.b('data')]
+        labels = batch.get(
+            six.b('labels'), batch.get(six.b('fine_labels'), None))
         assert labels is not None
-        for sample, label in zip(data, labels):
+        for sample, label in six.moves.zip(data, labels):
             yield (sample / 255.0).astype(numpy.float32), int(label)
 
     def reader():
@@ -59,7 +62,11 @@ def reader_creator(filename, sub_name, cycle=False):
 
             while True:
                 for name in names:
-                    batch = pickle.load(f.extractfile(name))
+                    if six.PY2:
+                        batch = pickle.load(f.extractfile(name))
+                    else:
+                        batch = pickle.load(
+                            f.extractfile(name), encoding='bytes')
                     for item in read_batch(batch):
                         yield item
                 if not cycle:
diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py
index 6195cc50df338e83bea1f4ad416529464636a33e..1d7ff582c86a40c8c2086e0de16e89d69c94da60 100644
--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import requests
 import hashlib
 import os
@@ -85,10 +87,10 @@ def download(url, module_name, md5sum, save_name=None):
         total_length = r.headers.get('content-length')
 
         if total_length is None:
-            with open(filename, 'w') as f:
+            with open(filename, 'wb') as f:
                 shutil.copyfileobj(r.raw, f)
         else:
-            with open(filename, 'w') as f:
+            with open(filename, 'wb') as f:
                 dl = 0
                 total_length = int(total_length)
                 for data in r.iter_content(chunk_size=4096):
diff --git a/python/paddle/dataset/conll05.py b/python/paddle/dataset/conll05.py
index a97c95d067b876a87f0aa19b2ddd0702a848bd4a..55cfd92721e95d66f1cf38e2f77d9bb6b9e17d7a 100644
--- a/python/paddle/dataset/conll05.py
+++ b/python/paddle/dataset/conll05.py
@@ -20,15 +20,18 @@ dataset. And a pre-trained word vector model based on Wikipedia corpus is used
 to initialize SRL model.
 """
 
+from __future__ import print_function
+
 import tarfile
 import gzip
 import itertools
 import paddle.dataset.common
-from six.moves import zip
+import paddle.compat as cpt
+from six.moves import zip, range
 
 __all__ = ['test, get_dict', 'get_embedding', 'convert']
 
-DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
+DATA_URL = 'http://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gz'
 DATA_MD5 = '387719152ae52d60422c016e92a742fc'
 WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
 WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
@@ -89,8 +92,8 @@ def corpus_reader(data_path, words_name, props_name):
             labels = []
             one_seg = []
             for word, label in zip(words_file, props_file):
-                word = word.strip()
-                label = label.strip().split()
+                word = cpt.to_text(word.strip())
+                label = cpt.to_text(label.strip().split())
 
                 if len(label) == 0:  # end of sentence
                     for i in range(len(one_seg[0])):
diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index 914dae348bc94d061072543aa14aba2219f4b52d..aa73bbaf7024ec873d9e921205536f12e097ff32 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -28,6 +28,9 @@ Graphics and Image Processing (2008)
 http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
 
 """
+
+from __future__ import print_function
+
 import itertools
 import functools
 from .common import download
@@ -116,7 +119,7 @@ def reader_creator(data_file,
             for file in open(file_list):
                 file = file.strip()
                 batch = None
-                with open(file, 'r') as f:
+                with open(file, 'rb') as f:
                     batch = pickle.load(f)
                 data = batch['data']
                 labels = batch['label']
diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py
index 3b3d89c93c48d611dccf6f14958c310a6cac1a7b..1cd50bd1802095db07e5618f37b0d42d11e94760 100644
--- a/python/paddle/dataset/image.py
+++ b/python/paddle/dataset/image.py
@@ -29,10 +29,18 @@ the image layout as follows.
   formats can be used for training. Noted that, the format should
   be keep consistent between the training and inference peroid.
 """
+
+from __future__ import print_function
+
 import numpy as np
 try:
     import cv2
 except ImportError:
+    import sys
+    sys.stderr.write(
+        '''Warning with paddle image module: opencv-python should be imported,
+    or paddle image module could NOT work; please install opencv-python first.'''
+    )
     cv2 = None
 import os
 import tarfile
@@ -56,7 +64,7 @@ def batch_images_from_tar(data_file,
     :type data_file: string
     :param dataset_name: 'train','test' or 'valid'
     :type dataset_name: string
-    :param img2label: a dic with image file name as key 
+    :param img2label: a dic with image file name as key
                     and image's label as value
     :type img2label: dic
     :param num_per_batch: image number per batch file
@@ -88,7 +96,7 @@ def batch_images_from_tar(data_file,
                 output['data'] = data
                 pickle.dump(
                     output,
-                    open('%s/batch_%d' % (out_path, file_id), 'w'),
+                    open('%s/batch_%d' % (out_path, file_id), 'wb'),
                     protocol=pickle.HIGHEST_PROTOCOL)
                 file_id += 1
                 data = []
@@ -99,7 +107,7 @@ def batch_images_from_tar(data_file,
         output['data'] = data
         pickle.dump(
             output,
-            open('%s/batch_%d' % (out_path, file_id), 'w'),
+            open('%s/batch_%d' % (out_path, file_id), 'wb'),
             protocol=pickle.HIGHEST_PROTOCOL)
 
     with open(meta_file, 'a') as meta:
@@ -113,7 +121,7 @@ def load_image_bytes(bytes, is_color=True):
     Load an color or gray image from bytes array.
 
     Example usage:
-    
+
     .. code-block:: python
 
         with open('cat.jpg') as f:
@@ -126,6 +134,8 @@ def load_image_bytes(bytes, is_color=True):
                      load and return a gray image.
     :type is_color: bool
     """
+    assert cv2 is not None
+
     flag = 1 if is_color else 0
     file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
     img = cv2.imdecode(file_bytes, flag)
@@ -137,7 +147,7 @@ def load_image(file, is_color=True):
     Load an color or gray image from the file path.
 
     Example usage:
-    
+
     .. code-block:: python
 
         im = load_image('cat.jpg')
@@ -149,6 +159,8 @@ def load_image(file, is_color=True):
                      load and return a gray image.
     :type is_color: bool
     """
+    assert cv2 is not None
+
     # cv2.IMAGE_COLOR for OpenCV3
     # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
     # cv2.IMAGE_GRAYSCALE for OpenCV3
@@ -161,27 +173,29 @@ def load_image(file, is_color=True):
 
 
 def resize_short(im, size):
-    """ 
+    """
     Resize an image so that the length of shorter edge is size.
 
     Example usage:
-    
+
     .. code-block:: python
 
         im = load_image('cat.jpg')
         im = resize_short(im, 256)
-    
+
     :param im: the input image with HWC layout.
     :type im: ndarray
     :param size: the shorter edge size of image after resizing.
     :type size: int
     """
+    assert cv2 is not None
+
     h, w = im.shape[:2]
     h_new, w_new = size, size
     if h > w:
-        h_new = size * h / w
+        h_new = size * h // w
     else:
-        w_new = size * w / h
+        w_new = size * w // h
     im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC)
     return im
 
@@ -193,17 +207,17 @@ def to_chw(im, order=(2, 0, 1)):
     according the order (2,0,1).
 
     Example usage:
-    
+
     .. code-block:: python
 
         im = load_image('cat.jpg')
         im = resize_short(im, 256)
         im = to_chw(im)
-    
+
     :param im: the input image with HWC layout.
     :type im: ndarray
     :param order: the transposed order.
-    :type order: tuple|list 
+    :type order: tuple|list
     """
     assert len(im.shape) == len(order)
     im = im.transpose(order)
@@ -215,11 +229,11 @@ def center_crop(im, size, is_color=True):
     Crop the center of image with size.
 
     Example usage:
-    
+
     .. code-block:: python
 
         im = center_crop(im, 224)
-    
+
     :param im: the input image with HWC layout.
     :type im: ndarray
     :param size: the cropping size.
@@ -228,8 +242,8 @@ def center_crop(im, size, is_color=True):
     :type is_color: bool
     """
     h, w = im.shape[:2]
-    h_start = (h - size) / 2
-    w_start = (w - size) / 2
+    h_start = (h - size) // 2
+    w_start = (w - size) // 2
     h_end, w_end = h_start + size, w_start + size
     if is_color:
         im = im[h_start:h_end, w_start:w_end, :]
@@ -243,11 +257,11 @@ def random_crop(im, size, is_color=True):
     Randomly crop input image with size.
 
     Example usage:
-    
+
     .. code-block:: python
 
         im = random_crop(im, 224)
-    
+
     :param im: the input image with HWC layout.
     :type im: ndarray
     :param size: the cropping size.
@@ -272,11 +286,11 @@ def left_right_flip(im, is_color=True):
     Return the flipped image.
 
     Example usage:
-    
+
     .. code-block:: python
 
         im = left_right_flip(im)
-    
+
     :param im: input image with HWC layout or HW layout for gray image
     :type im: ndarray
     :param is_color: whether input image is color or not
@@ -299,7 +313,7 @@ def simple_transform(im,
     resizing, croping and flipping.
 
     Example usage:
-    
+
     .. code-block:: python
 
         im = simple_transform(im, 256, 224, True)
@@ -314,7 +328,7 @@ def simple_transform(im,
     :type is_train: bool
     :param is_color: whether the image is color or not.
     :type is_color: bool
-    :param mean: the mean values, which can be element-wise mean values or 
+    :param mean: the mean values, which can be element-wise mean values or
                  mean values per channel.
     :type mean: numpy array | list
     """
@@ -332,7 +346,7 @@ def simple_transform(im,
     im = im.astype('float32')
     if mean is not None:
         mean = np.array(mean, dtype=np.float32)
-        # mean value, may be one value per channel 
+        # mean value, may be one value per channel
         if mean.ndim == 1 and is_color:
             mean = mean[:, np.newaxis, np.newaxis]
         elif mean.ndim == 1:
@@ -357,7 +371,7 @@ def load_and_transform(filename,
     for the transform operations.
 
     Example usage:
-    
+
     .. code-block:: python
 
         im = load_and_transform('cat.jpg', 256, 224, True)
@@ -372,7 +386,7 @@ def load_and_transform(filename,
     :type is_train: bool
     :param is_color: whether the image is color or not.
     :type is_color: bool
-    :param mean: the mean values, which can be element-wise mean values or 
+    :param mean: the mean values, which can be element-wise mean values or
                  mean values per channel.
     :type mean: numpy array | list
     """
diff --git a/python/paddle/dataset/imdb.py b/python/paddle/dataset/imdb.py
index e7fe4e0b7e5832c2bc7ca1307725936a70292c39..fd92523a947689a71b6f9371a3ef4838eb9d194d 100644
--- a/python/paddle/dataset/imdb.py
+++ b/python/paddle/dataset/imdb.py
@@ -20,11 +20,14 @@ of 25,000 highly polar movie reviews for training, and 25,000 for testing.
 Besides, this module also provides API for building dictionary.
 """
 
+from __future__ import print_function
+
 import paddle.dataset.common
 import collections
 import tarfile
 import re
 import string
+import six
 
 __all__ = ['build_dict', 'train', 'test', 'convert']
 
@@ -42,13 +45,14 @@ def tokenize(pattern):
         # sequential access of member files, other than
         # tarfile.extractfile, which does random access and might
         # destroy hard disks.
-        tf = next(tarf)
+        tf = tarf.next()
         while tf != None:
             if bool(pattern.match(tf.name)):
                 # newline and punctuations removal and ad-hoc tokenization.
-                yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
-                    None, string.punctuation).lower().split()
-            tf = next(tarf)
+                yield tarf.extractfile(tf).read().rstrip(six.b(
+                    "\n\r")).translate(
+                        None, six.b(string.punctuation)).lower().split()
+            tf = tarf.next()
 
 
 def build_dict(pattern, cutoff):
@@ -62,11 +66,11 @@ def build_dict(pattern, cutoff):
             word_freq[word] += 1
 
     # Not sure if we should prune less-frequent words here.
-    word_freq = [x for x in list(word_freq.items()) if x[1] > cutoff]
+    word_freq = [x for x in six.iteritems(word_freq) if x[1] > cutoff]
 
     dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
     words, _ = list(zip(*dictionary))
-    word_idx = dict(list(zip(words, list(range(len(words))))))
+    word_idx = dict(list(zip(words, six.moves.range(len(words)))))
     word_idx['<unk>'] = len(words)
     return word_idx
 
diff --git a/python/paddle/dataset/imikolov.py b/python/paddle/dataset/imikolov.py
index bc007c9d3c8e2f1e4ff091f7c2c93eacbbe8d0e0..8eecb75231de450282fa4838aca5b293cc2101d1 100644
--- a/python/paddle/dataset/imikolov.py
+++ b/python/paddle/dataset/imikolov.py
@@ -14,13 +14,17 @@
 """
 imikolov's simple dataset.
 
-This module will download dataset from 
+This module will download dataset from
 http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
 into paddle reader creators.
 """
+
+from __future__ import print_function
+
 import paddle.dataset.common
 import collections
 import tarfile
+import six
 
 __all__ = ['train', 'test', 'build_dict', 'convert']
 
@@ -64,11 +68,13 @@ def build_dict(min_word_freq=50):
             # remove <unk> for now, since we will set it as last index
             del word_freq['<unk>']
 
-        word_freq = [x for x in list(word_freq.items()) if x[1] > min_word_freq]
+        word_freq = [
+            x for x in six.iteritems(word_freq) if x[1] > min_word_freq
+        ]
 
         word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
         words, _ = list(zip(*word_freq_sorted))
-        word_idx = dict(list(zip(words, list(range(len(words))))))
+        word_idx = dict(list(zip(words, six.moves.range(len(words)))))
         word_idx['<unk>'] = len(words)
 
     return word_idx
@@ -89,7 +95,7 @@ def reader_creator(filename, word_idx, n, data_type):
                     l = ['<s>'] + l.strip().split() + ['<e>']
                     if len(l) >= n:
                         l = [word_idx.get(w, UNK) for w in l]
-                        for i in range(n, len(l) + 1):
+                        for i in six.moves.range(n, len(l) + 1):
                             yield tuple(l[i - n:i])
                 elif DataType.SEQ == data_type:
                     l = l.strip().split()
diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py
index ffa9008c80129b80b3807dbab37bc198e59cf5a2..38addd0cfd9bd0afde7eefc57f2111b717b7e636 100644
--- a/python/paddle/dataset/mnist.py
+++ b/python/paddle/dataset/mnist.py
@@ -17,10 +17,15 @@ MNIST dataset.
 This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
 parse training set and test set into paddle reader creators.
 """
+
+from __future__ import print_function
+
 import paddle.dataset.common
 import subprocess
 import numpy
 import platform
+import tempfile
+from six.moves import range
 __all__ = ['train', 'test', 'convert']
 
 URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
@@ -45,23 +50,28 @@ def reader_creator(image_filename, label_filename, buffer_size):
 
         # According to http://stackoverflow.com/a/38061619/724872, we
         # cannot use standard package gzip here.
-        m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE)
-        m.stdout.read(16)  # skip some magic bytes
+        tmp_image_file = tempfile.TemporaryFile(prefix='paddle_dataset')
+        m = subprocess.Popen(
+            [zcat_cmd, image_filename], stdout=tmp_image_file).communicate()
+        tmp_image_file.seek(16)  # skip some magic bytes
 
-        l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
-        l.stdout.read(8)  # skip some magic bytes
+        # Python3 will not take stdout as file
+        tmp_label_file = tempfile.TemporaryFile(prefix='paddle_dataset')
+        l = subprocess.Popen(
+            [zcat_cmd, label_filename], stdout=tmp_label_file).communicate()
+        tmp_label_file.seek(8)  # skip some magic bytes
 
         try:  # reader could be break.
             while True:
                 labels = numpy.fromfile(
-                    l.stdout, 'ubyte', count=buffer_size).astype("int")
+                    tmp_label_file, 'ubyte', count=buffer_size).astype("int")
 
                 if labels.size != buffer_size:
                     break  # numpy.fromfile returns empty slice after EOF.
 
                 images = numpy.fromfile(
-                    m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape(
-                        (buffer_size, 28 * 28)).astype('float32')
+                    tmp_image_file, 'ubyte', count=buffer_size * 28 *
+                    28).reshape((buffer_size, 28 * 28)).astype('float32')
 
                 images = images / 255.0 * 2.0 - 1.0
 
diff --git a/python/paddle/dataset/movielens.py b/python/paddle/dataset/movielens.py
index 056ec2178607329dd6daa1764820c2312bbaed59..c98e0019f7ab5fc2723e8df919257a59af7c9e5d 100644
--- a/python/paddle/dataset/movielens.py
+++ b/python/paddle/dataset/movielens.py
@@ -22,11 +22,15 @@ set and test set into paddle reader creators.
 
 """
 
+from __future__ import print_function
+
 import zipfile
 import paddle.dataset.common
 import re
 import random
 import functools
+import six
+import paddle.compat as cpt
 
 __all__ = [
     'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
@@ -112,6 +116,7 @@ def __initialize_meta_info__():
                 categories_set = set()
                 with package.open('ml-1m/movies.dat') as movie_file:
                     for i, line in enumerate(movie_file):
+                        line = cpt.to_text(line, encoding='latin')
                         movie_id, title, categories = line.strip().split('::')
                         categories = categories.split('|')
                         for c in categories:
@@ -136,6 +141,7 @@ def __initialize_meta_info__():
                 USER_INFO = dict()
                 with package.open('ml-1m/users.dat') as user_file:
                     for line in user_file:
+                        line = cpt.to_text(line, encoding='latin')
                         uid, gender, age, job, _ = line.strip().split("::")
                         USER_INFO[int(uid)] = UserInfo(
                             index=uid, gender=gender, age=age, job_id=job)
@@ -148,6 +154,7 @@ def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
     with zipfile.ZipFile(file=fn) as package:
         with package.open('ml-1m/ratings.dat') as rating:
             for line in rating:
+                line = cpt.to_text(line, encoding='latin')
                 if (rand.random() < test_ratio) == is_test:
                     uid, mov_id, rating, _ = line.strip().split("::")
                     uid = int(uid)
@@ -187,7 +194,7 @@ def max_movie_id():
     Get the maximum value of movie id.
     """
     __initialize_meta_info__()
-    return reduce(__max_index_info__, list(MOVIE_INFO.values())).index
+    return six.moves.reduce(__max_index_info__, list(MOVIE_INFO.values())).index
 
 
 def max_user_id():
@@ -195,7 +202,7 @@ def max_user_id():
     Get the maximum value of user id.
     """
     __initialize_meta_info__()
-    return reduce(__max_index_info__, list(USER_INFO.values())).index
+    return six.moves.reduce(__max_index_info__, list(USER_INFO.values())).index
 
 
 def __max_job_id_impl__(a, b):
@@ -210,7 +217,8 @@ def max_job_id():
     Get the maximum value of job id.
     """
     __initialize_meta_info__()
-    return reduce(__max_job_id_impl__, list(USER_INFO.values())).job_id
+    return six.moves.reduce(__max_job_id_impl__,
+                            list(USER_INFO.values())).job_id
 
 
 def movie_categories():
diff --git a/python/paddle/dataset/mq2007.py b/python/paddle/dataset/mq2007.py
index cc4d088316dfd490dc9d6b247c66c2495cedf2c3..d5740f30c898d5704636e1de9b2e1137d12e3c35 100644
--- a/python/paddle/dataset/mq2007.py
+++ b/python/paddle/dataset/mq2007.py
@@ -23,6 +23,8 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20
 
 """
 
+from __future__ import print_function
+
 import os
 import functools
 import rarfile
diff --git a/python/paddle/dataset/sentiment.py b/python/paddle/dataset/sentiment.py
index 953ada057bc114ebbfe39011d2fd3b5b7a2b0d37..22d867beea25c97efcbcb6f61ca2b7a7777f9c5c 100644
--- a/python/paddle/dataset/sentiment.py
+++ b/python/paddle/dataset/sentiment.py
@@ -20,6 +20,9 @@ The script fetch and preprocess movie_reviews data set that provided by NLTK
 TODO(yuyang18): Complete dataset.
 """
 
+from __future__ import print_function
+
+import six
 import collections
 from itertools import chain
 
@@ -64,7 +67,7 @@ def get_word_dict():
         for field in movie_reviews.fileids(category):
             for words in movie_reviews.words(field):
                 word_freq_dict[words] += 1
-    words_sort_list = list(word_freq_dict.items())
+    words_sort_list = six.iteritems(word_freq_dict)
     words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
     for index, word in enumerate(words_sort_list):
         words_freq_sorted.append((word[0], index))
diff --git a/python/paddle/dataset/tests/cifar_test.py b/python/paddle/dataset/tests/cifar_test.py
index 839125b09dd5c6432e3572374a7345a77a43f7cf..8e514f0fd9a18a7d512430111a8a11b942950d20 100644
--- a/python/paddle/dataset/tests/cifar_test.py
+++ b/python/paddle/dataset/tests/cifar_test.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.dataset.cifar
 import unittest
 
diff --git a/python/paddle/dataset/tests/common_test.py b/python/paddle/dataset/tests/common_test.py
index 777cd06a19726f8ad73774c958c8cb512808a3aa..0ce7d83f374f8c09f68527473418de8ce84c36b1 100644
--- a/python/paddle/dataset/tests/common_test.py
+++ b/python/paddle/dataset/tests/common_test.py
@@ -12,10 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.dataset.common
 import unittest
 import tempfile
 import glob
+from six.moves import range
 
 
 class TestCommon(unittest.TestCase):
diff --git a/python/paddle/dataset/tests/flowers_test.py b/python/paddle/dataset/tests/flowers_test.py
index 06260fd796ce0271b7cec2f42a8a5a255a02dc24..06a0a7761cfa10ca3211297d176e3e909332e271 100644
--- a/python/paddle/dataset/tests/flowers_test.py
+++ b/python/paddle/dataset/tests/flowers_test.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.dataset.flowers
 import unittest
 
diff --git a/python/paddle/dataset/tests/imdb_test.py b/python/paddle/dataset/tests/imdb_test.py
index 539da049449cd273db0a9e260851ed40e1be0f04..415947e3477f2e5b9979588528f7cb6f799acf6a 100644
--- a/python/paddle/dataset/tests/imdb_test.py
+++ b/python/paddle/dataset/tests/imdb_test.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.dataset.imdb
 import unittest
 import re
diff --git a/python/paddle/dataset/tests/imikolov_test.py b/python/paddle/dataset/tests/imikolov_test.py
index 50f50d947d221686d6308a6ed44cbcff3b10c6f5..1f78a5dd4d1a09c3192bc8c144c5a78c8a214f3a 100644
--- a/python/paddle/dataset/tests/imikolov_test.py
+++ b/python/paddle/dataset/tests/imikolov_test.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.dataset.imikolov
 import unittest
 
diff --git a/python/paddle/dataset/tests/mnist_test.py b/python/paddle/dataset/tests/mnist_test.py
index 8ada19d3f2ee13e194d08e19a4b86b558c69a0a7..fbb5d926494e38283e78ec15381530e50f32915d 100644
--- a/python/paddle/dataset/tests/mnist_test.py
+++ b/python/paddle/dataset/tests/mnist_test.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.dataset.mnist
 import unittest
 
diff --git a/python/paddle/dataset/tests/mq2007_test.py b/python/paddle/dataset/tests/mq2007_test.py
index fba388724a8e84591df7150b41f8ea39a850fc31..ee0897e88f0d7ad089b7f7b68d31d04d96fa3e9d 100644
--- a/python/paddle/dataset/tests/mq2007_test.py
+++ b/python/paddle/dataset/tests/mq2007_test.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.dataset.mq2007
 import unittest
 
diff --git a/python/paddle/dataset/tests/test_image.py b/python/paddle/dataset/tests/test_image.py
index 8bd56607ae1998935a3b3aaa0e3279515c2a540c..32d2eb17ae673e72bbee2fc3bb5e3b05f1b20074 100644
--- a/python/paddle/dataset/tests/test_image.py
+++ b/python/paddle/dataset/tests/test_image.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 
diff --git a/python/paddle/dataset/tests/test_sentiment.py b/python/paddle/dataset/tests/test_sentiment.py
index 37326517f7b39fb74c694684eb8a547d5f021946..bb9830132e987370022df3192060de3e908a2e85 100644
--- a/python/paddle/dataset/tests/test_sentiment.py
+++ b/python/paddle/dataset/tests/test_sentiment.py
@@ -15,6 +15,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import nltk
 import paddle.dataset.sentiment as st
diff --git a/python/paddle/dataset/tests/voc2012_test.py b/python/paddle/dataset/tests/voc2012_test.py
index 0d285461a8ae8a9cc69fbec0dcf5efc106b594f0..cddeb91cab2c0f90567f28f8258156e2bb654abc 100644
--- a/python/paddle/dataset/tests/voc2012_test.py
+++ b/python/paddle/dataset/tests/voc2012_test.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.dataset.voc2012
 import unittest
 
diff --git a/python/paddle/dataset/tests/wmt16_test.py b/python/paddle/dataset/tests/wmt16_test.py
index 8b949d8bf5212d51016a33da322095bde2038200..be121bb10121967590c9e136e9a1964a133e934b 100644
--- a/python/paddle/dataset/tests/wmt16_test.py
+++ b/python/paddle/dataset/tests/wmt16_test.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.dataset.wmt16
 import unittest
 
diff --git a/python/paddle/dataset/uci_housing.py b/python/paddle/dataset/uci_housing.py
index 410ca7af0d6d1dc26acbf92fce5e49fce7d3a3bb..f87fdcc4f0f3c42a92bcff5ddcd532c3108565b1 100644
--- a/python/paddle/dataset/uci_housing.py
+++ b/python/paddle/dataset/uci_housing.py
@@ -19,9 +19,10 @@ https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
 parse training set and test set into paddle reader creators.
 """
 
-import os
+from __future__ import print_function
 
 import numpy as np
+import six
 import tempfile
 import tarfile
 import os
@@ -70,11 +71,11 @@ def load_data(filename, feature_num=14, ratio=0.8):
         return
 
     data = np.fromfile(filename, sep=' ')
-    data = data.reshape(data.shape[0] / feature_num, feature_num)
+    data = data.reshape(data.shape[0] // feature_num, feature_num)
     maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
         axis=0) / data.shape[0]
     feature_range(maximums[:-1], minimums[:-1])
-    for i in range(feature_num - 1):
+    for i in six.moves.range(feature_num - 1):
         data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
     offset = int(data.shape[0] * ratio)
     UCI_TRAIN_DATA = data[:offset]
@@ -137,7 +138,7 @@ def predict_reader():
     It returns just one tuple data to do inference.
 
     :return: one tuple data
-    :rtype: tuple 
+    :rtype: tuple
     """
     global UCI_TEST_DATA
     load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))
diff --git a/python/paddle/dataset/voc2012.py b/python/paddle/dataset/voc2012.py
index 9c945574dbcc15f5cee370206ed7e70ba8ab5014..50688937654ae72b77e1439f21a0d7c847d5e135 100644
--- a/python/paddle/dataset/voc2012.py
+++ b/python/paddle/dataset/voc2012.py
@@ -19,6 +19,8 @@ to training/test sets has been maintained. The total number of images
 with segmentation has been increased from 7,062 to 9,993.
 """
 
+from __future__ import print_function
+
 import tarfile
 import io
 import numpy as np
diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py
index 7504474591fa486428d0310f10387818c4cf0300..f8c1a33574e642b21feb6843d115b7f4205ef250 100644
--- a/python/paddle/dataset/wmt14.py
+++ b/python/paddle/dataset/wmt14.py
@@ -19,10 +19,15 @@ http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
 parse training set and test set into paddle reader creators.
 
 """
+
+from __future__ import print_function
+
+import six
 import tarfile
 import gzip
 
 import paddle.dataset.common
+import paddle.compat as cpt
 
 __all__ = [
     'train',
@@ -53,7 +58,7 @@ def __read_to_dict(tar_file, dict_size):
         out_dict = dict()
         for line_count, line in enumerate(fd):
             if line_count < size:
-                out_dict[line.strip()] = line_count
+                out_dict[cpt.to_text(line.strip())] = line_count
             else:
                 break
         return out_dict
@@ -84,7 +89,7 @@ def reader_creator(tar_file, file_name, dict_size):
             ]
             for name in names:
                 for line in f.extractfile(name):
-                    line_split = line.strip().split('\t')
+                    line_split = line.strip().split(six.b('\t'))
                     if len(line_split) != 2:
                         continue
                     src_seq = line_split[0]  # one source sequence
@@ -153,8 +158,8 @@ def get_dict(dict_size, reverse=True):
     tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
     src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
     if reverse:
-        src_dict = {v: k for k, v in list(src_dict.items())}
-        trg_dict = {v: k for k, v in list(trg_dict.items())}
+        src_dict = {v: k for k, v in six.iteritems(src_dict)}
+        trg_dict = {v: k for k, v in six.iteritems(trg_dict)}
     return src_dict, trg_dict
 
 
diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py
index 4e3c466c38e402cc574e93ef3a5935edf8f9dd3b..f30dcd518ea6c0c685d027ede3ad6e0a1cb0c82c 100644
--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
@@ -28,12 +28,16 @@ Multi30K: Multilingual English-German Image Descriptions.
 }
 """
 
+from __future__ import print_function
+
 import os
+import six
 import tarfile
 import gzip
 from collections import defaultdict
 
 import paddle.dataset.common
+import paddle.compat as cpt
 
 __all__ = [
     "train",
@@ -60,7 +64,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
     word_dict = defaultdict(int)
     with tarfile.open(tar_file, mode="r") as f:
         for line in f.extractfile("wmt16/train"):
-            line_split = line.strip().split("\t")
+            line_split = line.strip().split(six.b("\t"))
             if len(line_split) != 2: continue
             sen = line_split[0] if lang == "en" else line_split[1]
             for w in sen.split():
@@ -70,8 +74,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
         fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
         for idx, word in enumerate(
                 sorted(
-                    iter(list(word_dict.items())),
-                    key=lambda x: x[1],
+                    six.iteritems(word_dict), key=lambda x: x[1],
                     reverse=True)):
             if idx + 3 == dict_size: break
             fout.write("%s\n" % (word[0]))
@@ -81,16 +84,16 @@ def __load_dict(tar_file, dict_size, lang, reverse=False):
     dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
                              "wmt16/%s_%d.dict" % (lang, dict_size))
     if not os.path.exists(dict_path) or (
-            len(open(dict_path, "r").readlines()) != dict_size):
+            len(open(dict_path, "rb").readlines()) != dict_size):
         __build_dict(tar_file, dict_size, dict_path, lang)
 
     word_dict = {}
-    with open(dict_path, "r") as fdict:
+    with open(dict_path, "rb") as fdict:
         for idx, line in enumerate(fdict):
             if reverse:
-                word_dict[idx] = line.strip()
+                word_dict[idx] = cpt.to_text(line.strip())
             else:
-                word_dict[line.strip()] = idx
+                word_dict[cpt.to_text(line.strip())] = idx
     return word_dict
 
 
@@ -120,7 +123,7 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
 
         with tarfile.open(tar_file, mode="r") as f:
             for line in f.extractfile(file_name):
-                line_split = line.strip().split("\t")
+                line_split = line.strip().split(six.b("\t"))
                 if len(line_split) != 2:
                     continue
                 src_words = line_split[src_col].split()
diff --git a/python/paddle/fluid/average.py b/python/paddle/fluid/average.py
index 358e24df31bb517604481bb48b9180e579f8460d..42cd3b36420ef5a17a9a7d981978ba8869809936 100644
--- a/python/paddle/fluid/average.py
+++ b/python/paddle/fluid/average.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 import warnings
 """
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index fd6a76dd0cfa347328d87093884e5cd324395497..a415cdbeaaae2a3bb4a137744205e3fe7366a78f 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -12,11 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from paddle.fluid import framework as framework
 from . import core
 import collections
 import copy
 import six
+from .. import compat as cpt
 from . import unique_name
 
 __all__ = ['append_backward']
@@ -45,13 +48,13 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
     """
     op_desc = core.OpDesc()
     op_desc.set_type(op_type)
-    for para, args in list(inputs.items()):
+    for para, args in six.iteritems(inputs):
         op_desc.set_input(
             para,
             list(
                 map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
                     args)))
-    for para, args in list(outputs.items()):
+    for para, args in six.iteritems(outputs):
         op_desc.set_output(
             para,
             list(
@@ -63,7 +66,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
     if op_role_attr_name not in attrs:
         attrs[
             op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward
-    for name, val in list(attrs.items()):
+    for name, val in six.iteritems(attrs):
         if isinstance(val, framework.Block):
             op_desc.set_block_attr(name, val.desc)
         else:
@@ -75,10 +78,10 @@ def _infer_var_data_type_(grad_var_name, block):
     """
     Infer the data type of given grad variable
     """
-    grad_var = block.desc.find_var(grad_var_name.encode("ascii"))
-    fwd_name = _strip_grad_suffix_(grad_var_name.encode("ascii"))
-    if block.desc.has_var_recursive(fwd_name):
-        fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii"))
+    grad_var = block.desc.find_var(cpt.to_bytes(grad_var_name))
+    fwd_name = _strip_grad_suffix_(grad_var_name)
+    if block.desc.has_var_recursive(cpt.to_bytes(fwd_name)):
+        fwd_var = block.desc.find_var_recursive(cpt.to_bytes(fwd_name))
         grad_var.set_dtype(fwd_var.dtype())
     else:
         grad_var.set_dtype(core.VarDesc.VarType.FP32)
@@ -102,8 +105,10 @@ def _some_in_set_(cands, s):
     """
     if len(cands) == 0:
         return False
-    for c in cands:
-        if c in s:
+    literal_set = cpt.to_text(s)
+    literal_cands = cpt.to_text(cands)
+    for c in literal_cands:
+        if c in literal_set:
             return True
     return False
 
@@ -114,9 +119,8 @@ def _strip_grad_suffix_(name):
     e.g. x@GRAD ==> x
          y@GRAD@RENAME@1 ==> y
     """
-    if isinstance(name, six.text_type):
-        name = name.encode()
-    pos = name.find(six.b(core.grad_var_suffix()))
+    name = cpt.to_text(name)
+    pos = name.find(core.grad_var_suffix())
     return name[:pos] if pos != -1 else name
 
 
@@ -125,9 +129,7 @@ def _append_grad_suffix_(name):
     Append grad suffix to the given variable name
     e.g. x ==> x@GRAD
     """
-    if isinstance(name, six.text_type):
-        name = name.encode()
-    return name + six.b(core.grad_var_suffix())
+    return cpt.to_text(name) + core.grad_var_suffix()
 
 
 def _addup_repetitive_outputs_(op_descs):
@@ -187,7 +189,7 @@ def _addup_repetitive_outputs_(op_descs):
                     op_desc.set_output(param_name, arg_names)
                     renamed_vars[var_name].append(new_name)
 
-    for var_name, inputs in list(renamed_vars.items()):
+    for var_name, inputs in six.iteritems(renamed_vars):
         if len(inputs) > 1:
             pending_sum_ops.append(
                 (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
@@ -243,7 +245,7 @@ from .proto import framework_pb2
 
 def serialize_op_decs(op_desc):
     protostr = op_desc.serialize_to_string()
-    proto = framework_pb2.OpDesc.FromString(str(protostr))
+    proto = framework_pb2.OpDesc.FromString(six.binary_type(protostr))
     return proto.__str__()
 
 
@@ -364,7 +366,7 @@ def _append_backward_ops_(block,
 
         # Getting op's corresponding grad_op
         grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
-            op.desc, no_grad_dict[block.idx], grad_sub_block_list)
+            op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list)
 
         grad_op_descs.extend(grad_op_desc)
         grad_to_var.update(op_grad_to_var)
@@ -411,11 +413,10 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
         new_vars = set()
         # create new gradient variables
         for grad_var_name in op_desc.output_arg_names():
-            grad_var_name = grad_var_name.encode("ascii")
-            if block.desc.has_var_recursive(
-                    grad_var_name) or grad_var_name == core.empty_var_name():
+            if block.desc.has_var_recursive(cpt.to_bytes(
+                    grad_var_name)) or grad_var_name == core.empty_var_name():
                 continue
-            block.desc.var(grad_var_name)
+            block.desc.var(cpt.to_bytes(grad_var_name))
             new_vars.add(grad_var_name)
             if grad_var_name not in grad_to_var:
                 continue
@@ -445,7 +446,7 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
                 op_desc.rename_output(name, new_name)
                 var_map[name] = new_name
 
-    for g, ng in list(var_map.items()):
+    for g, ng in six.iteritems(var_map):
         if g in grad_to_var:
             grad_to_var[ng] = grad_to_var[g]
             grad_to_var.pop(g)
@@ -595,11 +596,12 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
         parameters = parameter_list
     else:
         params = program.global_block().all_parameters()
+        program.global_block().iter_parameters()
         parameters = [param.name for param in params]
 
     params_and_grads = []
     for param in parameters:
-        if param not in grad_info_map:
+        if cpt.to_text(param) not in grad_info_map:
             continue
         grad_info = grad_info_map[param]
         grad_block = grad_info[1]
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 4b0a792f784fffcce3f911d3e7448b472d39f8e1..ba7ba3b5e983bfbaa82fc752f4821e8a934dfb8c 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import copy
 import six
 
diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py
index 676a52a917dd1f9700ec38de32932938ec339be5..b4a06f23a6f2713b665bdd42919925e4a0475a82 100644
--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from .layers.control_flow import BlockGuard, equal
 from .framework import Operator
 from .layer_helper import LayerHelper, unique_name
diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py
index 58f2da1c3ba2f84602e7a18c7b1c78d1f0d2ede1..5607f11932bbe6aff548be316dc39b4636e079f4 100644
--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from . import decoder
 from .decoder import *
 from . import memory_usage_calc
diff --git a/python/paddle/fluid/contrib/decoder/__init__.py b/python/paddle/fluid/contrib/decoder/__init__.py
index 6343c1543d206f82e605c5c986fa91d70c467113..9f973fd3c9af60a0c9a2ba5225a616671545436b 100644
--- a/python/paddle/fluid/contrib/decoder/__init__.py
+++ b/python/paddle/fluid/contrib/decoder/__init__.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from . import beam_search_decoder
 from .beam_search_decoder import *
 
diff --git a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
index d268a948f7a2cf038a419c95521b81088ed8215f..f2b7ac8375af25beed562b8279b6044f11c09d44 100644
--- a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
+++ b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
@@ -20,6 +20,8 @@ without using the low level API such as while ops.
 This API is still under active development and may change drastically.
 """
 
+from __future__ import print_function
+
 import contextlib
 import numpy as np
 import six
diff --git a/python/paddle/fluid/contrib/memory_usage_calc.py b/python/paddle/fluid/contrib/memory_usage_calc.py
index 5da846edb63c28efd791fdfac4046cfa56c24181..09721e430b7e5bb6b9891d5272ca54475baf6157 100644
--- a/python/paddle/fluid/contrib/memory_usage_calc.py
+++ b/python/paddle/fluid/contrib/memory_usage_calc.py
@@ -14,12 +14,16 @@
 """
 This module privides a memory usage calculate function for user.
 The purpose of this API is to allow users to estimate memory usage of
-a program under a special batch size, then user can set appropriate 
-batch size to fully utilize a GPU. 
+a program under a special batch size, then user can set appropriate
+batch size to fully utilize a GPU.
 
 This API is still under active development and may change drastically.
 """
 
+from __future__ import print_function
+
+import six
+
 from .. import core
 from ..framework import Program, Variable
 
@@ -45,15 +49,15 @@ def memory_usage(program, batch_size):
 
     Args:
         program(Program): The current Program.
-        batch_size(int): The current input data batch_size.  
-    
+        batch_size(int): The current input data batch_size.
+
     Returns:
         min_total_memory(float): the estimate memory usage lower bound.
         max_total_memory(float): the estimate memory usage upper bound.
         unit_str(string): the unit of estimate usage result.
-    
+
     Examples:
-        
+
         >>> import paddle.fluid as fluid
         >>> lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
                 fluid.default_main_program(), batch_size=10)
@@ -72,7 +76,7 @@ def memory_usage(program, batch_size):
 
     # Get the var_name list of first block and calculate
     total_memory = 0.0
-    for var in program.global_block().vars.itervalues():
+    for var in six.itervalues(program.global_block().vars):
         data_count = 1
         for x in var.shape:
             if x == -1:
@@ -81,10 +85,10 @@ def memory_usage(program, batch_size):
                 data_count *= x
         var_memory = data_count * dtype_to_size[var.dtype]
         if DEBUG:
-            print "%s memory usage: %d" % (var.name, var_memory)
+            print("%s memory usage: %d" % (var.name, var_memory))
         total_memory += var_memory
     if DEBUG:
-        print "total memory usage: %.2f" % (total_memory)
+        print("total memory usage: %.2f" % (total_memory))
 
     # Convert appropriate unit
     unit_str = "B"
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index 9452cf0e2a3a2eddb761149466bfc1ee3d23dfd9..631bbfe1fe59ddd9cd315fb64ca32e1e125b0e8d 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from . import core
 import numpy
 import os
diff --git a/python/paddle/fluid/debugger.py b/python/paddle/fluid/debugger.py
index b7a92cf044900acdd41ede378dd68aa2d9c6b2dc..63060a77d1abdfd4060648bfabe25709afcfeb8d 100644
--- a/python/paddle/fluid/debugger.py
+++ b/python/paddle/fluid/debugger.py
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import sys
+import six
 import re
 from .graphviz import GraphPreviewGenerator
 from .proto import framework_pb2
@@ -225,7 +228,7 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
     graph = GraphPreviewGenerator("some graph")
     # collect parameters and args
     protostr = block.desc.serialize_to_string()
-    desc = framework_pb2.BlockDesc.FromString(str(protostr))
+    desc = framework_pb2.BlockDesc.FromString(six.binary_type(protostr))
 
     def need_highlight(name):
         if highlights is None: return False
diff --git a/python/paddle/fluid/default_scope_funcs.py b/python/paddle/fluid/default_scope_funcs.py
index f8faf6942524612ccc63713240bb289eeeaf75eb..a5b2c84dfe6f2650b4a2ee4465f723812e5d4a01 100644
--- a/python/paddle/fluid/default_scope_funcs.py
+++ b/python/paddle/fluid/default_scope_funcs.py
@@ -26,6 +26,8 @@ A `scoped_function` will take a `function` as input. That function will be
 invoked in a new local scope.
 """
 
+from __future__ import print_function
+
 import paddle.fluid.core
 import threading
 
diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py
index c0671cce9a1f169f02ba03a839c45b6e4df2c47a..7a82038ff78b17b2ddfd7b47320d41a7de9a2b8a 100644
--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import warnings
 import numpy as np
 
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index e24b9faae24084ccc743a5b5126db9667089e128..288951cd7cd32155f136125fb817c35dd2ec6444 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 import contextlib
 import six
@@ -320,8 +322,9 @@ class Executor(object):
         # append fetch_operators
         if not has_fetch_operators(global_block, fetch_list, fetch_var_name):
             for i, var in enumerate(fetch_list):
-                assert isinstance(var, Variable) or isinstance(var, str), (
-                    "Wrong type for fetch_list[%s]: %s" % (i, type(var)))
+                assert isinstance(var, Variable) or isinstance(
+                    var, six.string_types), (
+                        "Wrong type for fetch_list[%s]: %s" % (i, type(var)))
                 global_block.append_op(
                     type='fetch',
                     inputs={'X': [var]},
@@ -346,7 +349,7 @@ class Executor(object):
     def _fetch_data(self, fetch_list, fetch_var_name, scope):
         outs = [
             core.get_fetch_variable(scope, fetch_var_name, i)
-            for i in range(len(fetch_list))
+            for i in six.moves.range(len(fetch_list))
         ]
         return outs
 
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 45b3abb88c9431f52705bb62df2c32779dd0cf9d..62682d10324c7cfe656c9ddb09f1b61ac1772e69 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import collections
 import contextlib
 import re
@@ -19,6 +21,7 @@ import six
 
 import numpy as np
 
+from .. import compat as cpt
 from .proto import framework_pb2
 try:
     from . import core
@@ -27,7 +30,7 @@ except ImportError as e:
         """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
     if you encounters \"libmkldnn.so not found\" errors. If you have python
     installed in other directory, replace \"/usr/local/lib\" with your own
-    directory. The original error is: \n""" + e.message)
+    directory. The original error is: \n""" + cpt.get_exception_message(e))
 except Exception as e:
     raise e
 from . import unique_name
@@ -87,7 +90,7 @@ def convert_np_dtype_to_dtype_(np_dtype):
     elif dtype == np.uint8:
         return core.VarDesc.VarType.UINT8
     else:
-        raise ValueError("Not supported numpy dtype " + six.binary_type(dtype))
+        raise ValueError("Not supported numpy dtype %s" % dtype)
 
 
 def dtype_is_floating(dtype):
@@ -198,11 +201,11 @@ class Variable(object):
         if name is None:
             name = unique_name.generate('_generated_var')
         is_new_var = False
-        name = name if isinstance(name, six.binary_type) else name.encode()
-        self.desc = self.block.desc.find_var(name)
+        name = cpt.to_text(name)
+        self.desc = self.block.desc.find_var(cpt.to_bytes(name))
 
         if self.desc is None:
-            self.desc = self.block.desc.var(name)
+            self.desc = self.block.desc.var(cpt.to_bytes(name))
             is_new_var = True
 
         if is_new_var:
@@ -325,7 +328,7 @@ class Variable(object):
 
     @property
     def name(self):
-        return self.desc.name()
+        return cpt.to_text(self.desc.name())
 
     @name.setter
     def name(self, new_name):
@@ -531,14 +534,7 @@ class Operator(object):
                         elif isinstance(arg, six.binary_type):
                             in_arg_names.append(arg.decode())
                         else:
-                            if isinstance(arg.name, six.string_types):
-                                in_arg_names.append(arg.name)
-                            elif isinstance(arg.name, six.binary_type):
-                                in_arg_names.append(arg.name.decode())
-                            else:
-                                raise TypeError(
-                                    "arguments require unicode, str or bytes, but get %s instead."
-                                    % (type(arg.name)))
+                            in_arg_names.append(cpt.to_text(arg.name))
                     self.desc.set_input(in_proto.name, in_arg_names)
                 else:
                     self.desc.set_input(in_proto.name, [])
@@ -567,14 +563,7 @@ class Operator(object):
                         (out_proto.name, len(out_args)))
                 out_arg_names = []
                 for arg in out_args:
-                    if isinstance(arg.name, six.string_types):
-                        out_arg_names.append(arg.name)
-                    elif isinstance(arg.name, six.binary_type):
-                        out_arg_names.append(arg.name.decode())
-                    else:
-                        raise TypeError(
-                            "arguments require unicode, str or bytes, but get %s instead."
-                            % (type(arg.name)))
+                    out_arg_names.append(cpt.to_text(arg.name))
                     arg.op = self
                 self.desc.set_output(out_proto.name, out_arg_names)
 
@@ -970,10 +959,9 @@ class Block(object):
             Variable: the Variable with the giving name.
         """
         if not isinstance(name, six.string_types):
-            if not isinstance(name, six.binary_type):
-                raise TypeError(
-                    "var require string as parameter, but get %s instead." %
-                    (type(name)))
+            raise TypeError(
+                "var require string as parameter, but get %s instead." %
+                (type(name)))
         v = self.vars.get(name, None)
         if v is None:
             raise ValueError("var %s not in this block" % name)
@@ -1024,7 +1012,7 @@ class Block(object):
         return list(self.iter_parameters())
 
     def iter_parameters(self):
-        return (item[1] for item in list(self.vars.items())
+        return (item[1] for item in six.iteritems(self.vars)
                 if isinstance(item[1], Parameter))
 
     def create_var(self, *args, **kwargs):
@@ -1052,6 +1040,9 @@ class Block(object):
         Returns:
             Variable: the Variable with the giving name.
         """
+        name = cpt.to_text(name)
+        new_name = cpt.to_text(new_name)
+
         if not self.has_var(name):
             raise ValueError("var %s is not in current block" % name)
         v = self.var(name)
@@ -1070,9 +1061,9 @@ class Block(object):
         else:
             raise ValueError("unsupported var type: %s", type(v))
         orig_var_type = v.type
-        self.desc._rename_var(name, new_name)
+        self.desc._rename_var(cpt.to_bytes(name), cpt.to_bytes(new_name))
         # NOTE: v is destroyed by C++ after calling _rename_var.
-        d = self.desc.find_var(new_name)
+        d = self.desc.find_var(cpt.to_bytes(new_name))
         if var_type == "Parameter":
             var = Parameter(
                 self,
@@ -1103,7 +1094,7 @@ class Block(object):
 
     def _remove_var(self, name):
         self._sync_with_cpp()
-        self.desc._remove_var(name)
+        self.desc._remove_var(cpt.to_bytes(name))
         del self.vars[name]
 
     def create_parameter(self, *args, **kwargs):
@@ -1205,7 +1196,7 @@ class Block(object):
 
         # sync variables removed from c++ end
         for var in list(self.vars.keys()):
-            if not self.desc.find_var(var):
+            if not self.desc.find_var(cpt.to_bytes(var)):
                 self.vars.pop(var)
 
         # sync operators from cpp
@@ -1372,6 +1363,13 @@ class Program(object):
         self._current_role = core.op_proto_and_checker_maker.OpRole.Forward
         self._op_role_var = []
 
+        # for distribute
+        self._is_distributed = False
+        self._is_chief = False
+        self._slice_vars_and_attrs = []
+        self._endpoints = []
+        self._distributed_lookup_table = None
+
     @property
     def op_role(self):
         """
@@ -1576,7 +1574,9 @@ class Program(object):
             p.current_block_idx = self.current_block_idx
             p._seed = self._seed
             p.desc = core.ProgramDesc(self.desc)
-            p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
+            p.blocks = [
+                Block(p, i) for i in six.moves.range(self.desc.num_blocks())
+            ]
 
             p._current_role = self._current_role
             p._op_role_var = self._op_role_var
@@ -1632,7 +1632,9 @@ class Program(object):
             targets_idx.append([t.block.idx, t.idx])
         res = Program()
         res.desc = core.prune(self.desc, targets_idx)
-        res.blocks = [Block(res, i) for i in range(res.desc.num_blocks())]
+        res.blocks = [
+            Block(res, i) for i in six.moves.range(res.desc.num_blocks())
+        ]
         res._sync_with_cpp()
         return res
 
@@ -1675,16 +1677,18 @@ class Program(object):
                 root_block._remove_op(0, read_op_idx + 1)
             for var in root_block.all_vars():
                 if var.type() == core.VarDesc.VarType.READER:
-                    root_block._remove_var(var.name())
+                    root_block._remove_var(cpt.to_bytes(var.name()))
 
         # change all `is_test` attributes to True
-        for i in range(res.desc.num_blocks()):
+        for i in six.moves.range(res.desc.num_blocks()):
             block = res.desc.block(i)
-            for j in range(block.op_size()):
+            for j in six.moves.range(block.op_size()):
                 op = block.op(j)
                 if op.has_attr('is_test'):
                     op.set_attr('is_test', True)
-        res.blocks = [Block(res, i) for i in range(res.desc.num_blocks())]
+        res.blocks = [
+            Block(res, i) for i in six.moves.range(res.desc.num_blocks())
+        ]
         res._sync_with_cpp()
         return res
 
@@ -1704,7 +1708,7 @@ class Program(object):
         """
         p = Program()
         p.desc = core.ProgramDesc(binary_str)
-        p.blocks = [Block(p, i) for i in range(p.desc.num_blocks())]
+        p.blocks = [Block(p, i) for i in six.moves.range(p.desc.num_blocks())]
         p._sync_with_cpp()
         return p
 
diff --git a/python/paddle/fluid/graphviz.py b/python/paddle/fluid/graphviz.py
index ba67bf5ae6fe44ea23414d444a270c436c195326..2b18d854d18bcbebce2a0eb30b8690db49d9d246 100644
--- a/python/paddle/fluid/graphviz.py
+++ b/python/paddle/fluid/graphviz.py
@@ -12,9 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import os
 import random
 import six
+import functools
 import subprocess
 import logging
 
@@ -105,8 +108,9 @@ class Graph(object):
 
     def _rank_repr(self):
         ranks = sorted(
-            list(self.rank_groups.items()),
-            cmp=lambda a, b: a[1].priority > b[1].priority)
+            six.iteritems(self.rank_groups),
+            key=functools.cmp_to_key(
+                lambda a, b: a[1].priority > b[1].priority))
         repr = []
         for x in ranks:
             repr.append(str(x[1]))
@@ -149,7 +153,7 @@ class Node(object):
             name=self.name,
             label=self.label,
             extra=',' + ','.join("%s=%s" % (key, crepr(value))
-                                 for key, value in list(self.attrs.items()))
+                                 for key, value in six.iteritems(self.attrs))
             if self.attrs else "")
         return reprs
 
@@ -173,7 +177,7 @@ class Edge(object):
             target=self.target.name,
             extra="" if not self.attrs else
             "[" + ','.join("{}={}".format(attr[0], crepr(attr[1]))
-                           for attr in list(self.attrs.items())) + "]")
+                           for attr in six.iteritems(self.attrs)) + "]")
         return repr
 
 
diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py
index ff382d8b832b4b2bc6779dbb28d3fd95c8a0984e..3d2ef566173f81b29a6d8ea79cff00991a4ef3c4 100644
--- a/python/paddle/fluid/inferencer.py
+++ b/python/paddle/fluid/inferencer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import contextlib
 
 from . import core
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 6dedbae7a6586f862328c7f23d0aea6ba5022614..bd46ed8e50c9344d471578eb0f89b7e214d62722 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from . import framework
 import numpy as np
 import contextlib
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index af734210323913a36f861380dc38a98253aca0a1..b3ed094c892c6fce7184d6d98f50ed7d6d1642a3 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import os
 import errno
 import time
@@ -370,6 +372,7 @@ def load_vars(executor,
         load_vars(
             executor,
             dirname=dirname,
+            main_program=main_program,
             vars=list(filter(predicate, main_program.list_vars())),
             filename=filename)
     else:
@@ -401,9 +404,12 @@ def load_vars(executor,
                 inputs={},
                 outputs={"Out": load_var_list},
                 attrs={'file_path': os.path.join(dirname, filename)})
-
         executor.run(load_prog)
 
+        # load slice vars on pserver, if have it.
+        _load_slice_up_vars(executor, dirname,
+                            main_program._slice_vars_and_attrs)
+
 
 def load_params(executor, dirname, main_program=None, filename=None):
     """
@@ -603,25 +609,15 @@ def save_inference_model(dirname,
             # "./infer_model".
 
     """
-    if isinstance(feeded_var_names, six.binary_type):
+    if isinstance(feeded_var_names, six.string_types):
         feeded_var_names = [feeded_var_names]
-    elif isinstance(feeded_var_names, six.text_type):
-        feeded_var_names = [feeded_var_names.encode()]
     else:
         if len(feeded_var_names) > 0:
             # TODO(paddle-dev): polish these code blocks
             if not (bool(feeded_var_names) and all(
-                    isinstance(name, six.binary_type)
+                    isinstance(name, six.string_types)
                     for name in feeded_var_names)):
-                if not (all(
-                        isinstance(name, six.text_type)
-                        for name in feeded_var_names)):
-                    raise ValueError(
-                        "'feed_var_names' should be a list of str.")
-                else:
-                    feeded_var_names = [
-                        name.encode() for name in feeded_var_names
-                    ]
+                raise ValueError("'feed_var_names' should be a list of str.")
 
     if isinstance(target_vars, Variable):
         target_vars = [target_vars]
@@ -667,11 +663,19 @@ def save_inference_model(dirname,
 
     save_persistables(executor, dirname, inference_program, params_filename)
 
+    # if there is lookup table, the trainer 0 will notify all pserver to save.
+    if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
+        lookup_table_filename = os.path.join(dirname, "__lookup_table__")
+        _save_lookup_tables_by_notify(executor, lookup_table_filename,
+                                      main_program._distributed_lookup_table,
+                                      main_program._endpoints)
+
 
 def load_inference_model(dirname,
                          executor,
                          model_filename=None,
-                         params_filename=None):
+                         params_filename=None,
+                         pserver_endpoints=None):
     """
     Load inference model from a directory
 
@@ -687,6 +691,10 @@ def load_inference_model(dirname,
                                    parameters were saved in a single binary
                                    file. If parameters were saved in separate
                                    files, set it as 'None'.
+        pserver_endpoints(list|None): This only need by distributed inference.
+                                    When use distributed look up table in training,
+                                    We also need it in inference.The parameter is
+                                    a list of pserver endpoints.
 
     Returns:
         tuple: The return of this function is a tuple with three elements:
@@ -705,12 +713,16 @@ def load_inference_model(dirname,
 
             exe = fluid.Executor(fluid.CPUPlace())
             path = "./infer_model"
+            endpoints = ["127.0.0.1:2023","127.0.0.1:2024"]
             [inference_program, feed_target_names, fetch_targets] =
                 fluid.io.load_inference_model(dirname=path, executor=exe)
             results = exe.run(inference_program,
                           feed={feed_target_names[0]: tensor_img},
                           fetch_list=fetch_targets)
 
+            # if we need lookup table, we will use:
+            fluid.io.load_inference_model(dirname=path, executor=exe, pserver_endpoints=endpoints)
+
             # In this exsample, the inference program was saved in the
             # "./infer_model/__model__" and parameters were saved in
             # separate files in ""./infer_model".
@@ -737,6 +749,9 @@ def load_inference_model(dirname,
     program = Program.parse_from_string(program_desc_str)
     load_persistables(executor, dirname, program, params_filename)
 
+    if pserver_endpoints:
+        program = _endpoints_replacement(program, pserver_endpoints)
+
     feed_target_names = program.desc.get_feed_target_names()
     fetch_target_names = program.desc.get_fetch_target_names()
     fetch_targets = [
@@ -746,6 +761,61 @@ def load_inference_model(dirname,
     return [program, feed_target_names, fetch_targets]
 
 
+def _save_lookup_tables_by_notify(executor, dirname, lookup_table,
+                                  pserver_endpoints):
+    """
+    This function will send checkpoint notify message from Trainer 0
+    to all the pservers.
+    The checkpoint notify message contains lookup table name,
+    the absolute path on pserver to save lookup_table.
+
+    Args:
+        executor(Executor): The executor to run for send checkpoint notify.
+        dirname(str): The folder where to save.
+        lookup_table(string): the lookup table name, when use distribute
+            lookup table, we can get lookup table name by DistributeTranspiler.
+            table_name
+        ps_endpoint_list(list): the parameter server ip:port list.
+            when use distribute lookup table, we can get ps_endpoint_list by
+            distribute arguments.
+    Return:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            table_name = "share_w"
+            ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
+
+            _save_pserver_vars_by_notify(executor=exe,
+                    dirname=param_path, lookup_table=table_name,
+                    pserver_endpoints=ps_endpoints)
+    """
+
+    pserver_notify_program = Program()
+    pserver_notify_block = pserver_notify_program.global_block()
+
+    attrs = {}
+    attrs['epmap'] = pserver_endpoints
+    attrs['dir'] = dirname
+    attrs['lookup_table'] = lookup_table
+
+    pserver_notify_block.append_op(
+        type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
+    executor.run(pserver_notify_program)
+
+
+def _endpoints_replacement(program, endpoints):
+    ENDPOINT_MAP = "epmap"
+    for op in program.global_block().ops:
+        if op.has_attr(ENDPOINT_MAP):
+            op.set_attr(ENDPOINT_MAP, endpoints)
+    program._sync_with_cpp()
+    return program
+
+
 def get_parameter_value(para, executor):
     """
     Get the LoDTensor value of the given parameter.
@@ -807,3 +877,46 @@ def get_parameter_value_by_name(name, executor, program=None):
         program = default_main_program()
     var = program.global_block().var(name)
     return get_parameter_value(var, executor)
+
+
+def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
+    if not slice_vars_and_attrs:
+        return
+
+    load_prog = Program()
+    load_block = load_prog.global_block()
+
+    for var_tuple in slice_vars_and_attrs:
+        orig_var = var_tuple[0]
+        start = var_tuple[1]
+        slice_var = var_tuple[2]
+        end = start + reduce(lambda x, y: x * y, slice_var.shape)
+
+        clone_orig_var = load_block.create_var(
+            name=orig_var.name,
+            type=orig_var.type,
+            shape=orig_var.shape,
+            dtype=orig_var.dtype,
+            persistable=True)
+
+        clone_slice_var = load_block.create_var(
+            name=slice_var.name,
+            type=slice_var.type,
+            shape=slice_var.shape,
+            dtype=slice_var.dtype,
+            persistable=True)
+
+        load_block.append_op(
+            type='load',
+            inputs={},
+            outputs={'Out': [clone_orig_var]},
+            attrs={'file_path': os.path.join(dirname, clone_orig_var.name)})
+        load_block.append_op(
+            type="slice",
+            inputs={'Input': clone_orig_var},
+            outputs={'Out': clone_slice_var},
+            attrs={'axes': [0],
+                   'starts': [start],
+                   'ends': [end]})
+
+    executor.run(load_prog)
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index 0c2b1eb795860373220eb254612161f7dc816ffd..bd9727b6ac0208b199091db00bd0fd5fae74d53b 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import copy
 import itertools
 import six
@@ -85,7 +87,7 @@ class LayerHelper(object):
             raise ValueError("parameter number mismatch")
         elif len(param_attr) == 1 and length != 1:
             tmp = [None] * length
-            for i in range(length):
+            for i in six.moves.range(length):
                 tmp[i] = copy.deepcopy(param_attr[0])
             param_attr = tmp
         return param_attr
diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py
index a48e360463456ab7e00534dc0684aa153c8205cd..a2a808777ddc499570eb9ef92175787a14cf77ca 100644
--- a/python/paddle/fluid/layers/__init__.py
+++ b/python/paddle/fluid/layers/__init__.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from . import ops
 from .ops import *
 from . import nn
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 9fb7b4d0cad67db2d2d4b56e43d8837b8160cdb0..8bfe11916bd069cd2dd7016c03644d6cad1e188d 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import contextlib
 
 from .layer_function_generator import autodoc, templatedoc
@@ -22,6 +24,7 @@ from ..initializer import force_init_on_cpu
 from .ops import logical_and, logical_not, logical_or
 import numpy
 import warnings
+import six
 from functools import reduce
 
 __all__ = [
@@ -602,7 +605,7 @@ class StaticRNN(object):
         boot_memories = []
         pre_memories = []
         memories = []
-        for _, mem in list(self.memories.items()):
+        for _, mem in six.iteritems(self.memories):
             boot_memories.append(mem.init)
             pre_memories.append(mem.pre_mem.name)
             mem_var = rnn_block.var(mem.mem.name)
@@ -1269,8 +1272,8 @@ class ConditionalBlock(object):
         parent_block.append_op(
             type='conditional_block',
             inputs={
-                'X': self.inputs,
-                'Params': param_list,
+                'Cond': self.inputs,
+                'Input': param_list,
             },
             outputs={'Out': out_list,
                      'Scope': [step_scope]},
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index b996c8368862184f9bc8b177f3b6e43aebdfb007..72071478845df444ce72ce946787b2d0ce5f0d23 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -15,13 +15,17 @@
 All layers just related to the detection neural network.
 """
 
+from __future__ import print_function
+
 from .layer_function_generator import generate_layer_fn
 from .layer_function_generator import autodoc, templatedoc
 from ..layer_helper import LayerHelper
 from . import tensor
 from . import nn
 from . import ops
+from ... import compat as cpt
 import math
+import six
 import numpy
 from functools import reduce
 
@@ -104,7 +108,7 @@ def rpn_target_assign(loc,
             examples.
 
     Returns:
-        tuple: 
+        tuple:
                A tuple(predicted_scores, predicted_location, target_label,
                target_bbox) is returned. The predicted_scores and
                predicted_location is the predicted result of the RPN.
@@ -115,7 +119,7 @@ def rpn_target_assign(loc,
                anchors. The predicted_scores is a 2D Tensor with shape
                [F + B, 1], and the shape of target_label is same as the shape
                of the predicted_scores, B is the number of the background
-               anchors, the F and B is depends on the input of this operator. 
+               anchors, the F and B is depends on the input of this operator.
 
     Examples:
         .. code-block:: python
@@ -232,8 +236,8 @@ def detection_output(loc,
         nms_eta(float): The parameter for adaptive NMS.
 
     Returns:
-        Variable: 
-        
+        Variable:
+
             The detection outputs is a LoDTensor with shape [No, 6].
             Each row has six values: [label, confidence, xmin, ymin, xmax, ymax].
             `No` is the total number of detections in this mini-batch. For each
@@ -504,7 +508,7 @@ def target_assign(input,
 
     Assumed that the row offset for each instance in `neg_indices` is called neg_lod,
     for i-th instance and each `id` of neg_indices in this instance:
-    
+
     .. code-block:: text
 
         out[i][id][0 : K] = {mismatch_value, mismatch_value, ...}
@@ -522,11 +526,11 @@ def target_assign(input,
        mismatch_value (float32): Fill this value to the mismatched location.
 
     Returns:
-        tuple: 
-               A tuple(out, out_weight) is returned. out is a 3D Tensor with 
-               shape [N, P, K], N and P is the same as they are in 
-               `neg_indices`, K is the same as it in input of X. If 
-               `match_indices[i][j]`. out_weight is the weight for output with 
+        tuple:
+               A tuple(out, out_weight) is returned. out is a 3D Tensor with
+               shape [N, P, K], N and P is the same as they are in
+               `neg_indices`, K is the same as it in input of X. If
+               `match_indices[i][j]`. out_weight is the weight for output with
                the shape of [N, P, 1].
 
     Examples:
@@ -834,7 +838,7 @@ def prior_box(input,
        offset(float): Prior boxes center offset. Default: 0.5
        name(str): Name of the prior box op. Default: None.
        min_max_aspect_ratios_order(bool): If set True, the output prior box is
-            in order of [min, max, aspect_ratios], which is consistent with 
+            in order of [min, max, aspect_ratios], which is consistent with
             Caffe. Please note, this order affects the weights order of
             convolution layer followed by and does not affect the final
             detection results. Default: False.
@@ -977,7 +981,7 @@ def multi_box_head(inputs,
        stride(int|list|tuple): The stride of conv2d. Default:1,
        name(str): Name of the prior box layer. Default: None.
        min_max_aspect_ratios_order(bool): If set True, the output prior box is
-            in order of [min, max, aspect_ratios], which is consistent with 
+            in order of [min, max, aspect_ratios], which is consistent with
             Caffe. Please note, this order affects the weights order of
             convolution layer followed by and does not affect the fininal
             detection results. Default: False.
@@ -1039,7 +1043,7 @@ def multi_box_head(inputs,
         min_sizes = []
         max_sizes = []
         step = int(math.floor(((max_ratio - min_ratio)) / (num_layer - 2)))
-        for ratio in range(min_ratio, max_ratio + 1, step):
+        for ratio in six.moves.range(min_ratio, max_ratio + 1, step):
             min_sizes.append(base_size * ratio / 100.)
             max_sizes.append(base_size * (ratio + step) / 100.)
         min_sizes = [base_size * .10] + min_sizes
@@ -1108,8 +1112,8 @@ def multi_box_head(inputs,
 
         mbox_loc = nn.transpose(mbox_loc, perm=[0, 2, 3, 1])
         compile_shape = [
-            mbox_loc.shape[0],
-            mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3] / 4, 4
+            mbox_loc.shape[0], cpt.floor_division(
+                mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3], 4), 4
         ]
         run_shape = tensor.assign(numpy.array([0, -1, 4]).astype("int32"))
         mbox_loc_flatten = nn.reshape(
@@ -1127,8 +1131,9 @@ def multi_box_head(inputs,
         conf_loc = nn.transpose(conf_loc, perm=[0, 2, 3, 1])
         new_shape = [0, -1, num_classes]
         compile_shape = [
-            conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] *
-            conf_loc.shape[3] / num_classes, num_classes
+            conf_loc.shape[0],
+            cpt.floor_division(conf_loc.shape[1] * conf_loc.shape[2] *
+                               conf_loc.shape[3], num_classes), num_classes
         ]
         run_shape = tensor.assign(
             numpy.array([0, -1, num_classes]).astype("int32"))
diff --git a/python/paddle/fluid/layers/device.py b/python/paddle/fluid/layers/device.py
index bb1fb7fd571a56acf367e663af0cf9431211bcea..43ebd160de3fd3d2a491a3ec1fbe0e4085fbd0b1 100644
--- a/python/paddle/fluid/layers/device.py
+++ b/python/paddle/fluid/layers/device.py
@@ -15,6 +15,8 @@
 All util layers.
 """
 
+from __future__ import print_function
+
 from .layer_function_generator import autodoc
 from ..framework import unique_name
 from ..layer_helper import LayerHelper
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 327ae309816344a0bcebfe70ffb59a00eab1d86f..b03ee514f50f9a8c1425bd5b1d409b58ed62351a 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -11,8 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import contextlib
 import multiprocessing
+import six
 import threading
 
 from ..data_feeder import DataFeeder
@@ -21,7 +24,7 @@ from .layer_function_generator import templatedoc
 from .. import core
 from ..executor import global_scope
 from ..framework import convert_np_dtype_to_dtype_, default_main_program, \
-    default_startup_program, program_guard, Program
+    default_startup_program, program_guard, Program, Variable
 from ..layer_helper import LayerHelper
 from ..unique_name import generate as unique_name
 
@@ -69,7 +72,7 @@ def data(name,
     """
     helper = LayerHelper('data', **locals())
     shape = list(shape)
-    for i in range(len(shape)):
+    for i in six.moves.range(len(shape)):
         if shape[i] is None:
             shape[i] = -1
             append_batch_size = False
@@ -206,7 +209,7 @@ class ListenAndServ(object):
             })
 
 
-def Send(endpoints, send_vars, sync=True):
+def Send(endpoints, send_vars, dummy_output=None, sync=True):
     """
     Send variables to the server side, and get vars from server
     side when server have finished running server side program.
@@ -220,6 +223,13 @@ def Send(endpoints, send_vars, sync=True):
     """
     assert (type(send_vars) == list)
 
+    if dummy_output is None:
+        dummy_output = []
+    elif isinstance(dummy_output, Variable):
+        dummy_output = [dummy_output]
+
+    assert (type(dummy_output) == list)
+
     epmap = endpoints.split(",")
     endpoints = list(set(epmap))
 
@@ -229,6 +239,7 @@ def Send(endpoints, send_vars, sync=True):
     helper.append_op(
         type="send",
         inputs={"X": send_vars},
+        outputs={"Out": dummy_output},
         attrs={
             "endpoints": endpoints,
             "epmap": epmap,
@@ -238,7 +249,7 @@ def Send(endpoints, send_vars, sync=True):
         helper.append_op(type="send_barrier", attrs={"endpoints": endpoints})
 
 
-def Recv(endpoints, get_vars, sync=True):
+def Recv(endpoints, get_vars, dummy_input=None, sync=True):
     """
     Receive variables from server side
 
@@ -253,13 +264,20 @@ def Recv(endpoints, get_vars, sync=True):
     """
     assert (type(get_vars) == list)
 
+    if dummy_input is None:
+        dummy_input = []
+    elif isinstance(dummy_input, Variable):
+        dummy_input = [dummy_input]
+
+    assert (type(dummy_input) == list)
+
     epmap = endpoints.split(",")
     endpoints = list(set(epmap))
 
     helper = LayerHelper("Recv", **locals())
     helper.append_op(
         type="recv",
-        inputs={"X": get_vars},
+        inputs={"X": dummy_input},
         outputs={"Out": get_vars},
         attrs={"endpoints": endpoints,
                "epmap": epmap})
@@ -674,7 +692,7 @@ def py_reader(capacity,
 
         def __tensor_provider__():
             for slots in paddle_reader():
-                yield [slots[str(idx)] for idx in xrange(counter)]
+                yield [slots[str(idx)] for idx in six.moves.xrange(counter)]
 
         __set_tensor_provider__(__tensor_provider__)
 
@@ -750,7 +768,7 @@ def open_files(filenames,
     else:
         buffer_size = int(buffer_size)
 
-    if isinstance(filenames, basestring):
+    if isinstance(filenames, six.string_types):
         filenames = [filenames]
     dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
     shape_concat = []
@@ -1005,7 +1023,7 @@ class Preprocessor(object):
         source_lod_levels = self.underlying_reader.desc.lod_levels()
         self.source_var_names = [
             unique_name("preprocessor_source")
-            for _ in range(len(source_shapes))
+            for _ in six.moves.range(len(source_shapes))
         ]
         source_vars = []
         for var_name, shape, dtype, lod_level in zip(
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index c0d72620b1ddb183f43ebce766688518b5a737ac..8963d74de014d69c590276d5ff7080111f614230 100644
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import re
 import functools
 import warnings
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index daf91a40f7ad7935d355a287819ad1dbcdd84eb8..be368007dd7061ba7fc97414dbadfce00d158776 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -20,6 +20,8 @@ User can also implement their own learning_rate_decay
 strategy according to this module.
 """
 
+from __future__ import print_function
+
 from . import control_flow
 from . import nn
 from . import ops
@@ -72,10 +74,10 @@ def noam_decay(d_model, warmup_steps):
 
 def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     """
-    Applies exponential decay to the learning rate. 
+    Applies exponential decay to the learning rate.
 
-    When training a model, it is often recommended to lower the learning rate as the 
-    training progresses. By using this function, the learning rate will be decayed by 
+    When training a model, it is often recommended to lower the learning rate as the
+    training progresses. By using this function, the learning rate will be decayed by
     'decay_rate' every 'decay_steps' steps.
 
     >>> if staircase == True:
@@ -148,8 +150,8 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     """
     Applies inverse time decay to the initial learning rate.
 
-    When training a model, it is often recommended to lower the learning rate as the 
-    training progresses. By using this function, an inverse decay function will be 
+    When training a model, it is often recommended to lower the learning rate as the
+    training progresses. By using this function, an inverse decay function will be
     applied to the initial learning rate.
 
     >>> if staircase == True:
diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
index 0e10a91d25877984396f9bcf9aae6438707eeab1..a458cebfb194a068d040a8919fd4abcb4b4bea80 100644
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from ..framework import Variable, unique_name
 from .layer_function_generator import OpProtoHolder
 from ..initializer import force_init_on_cpu
diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py
index 49bae1e8af768d93294120e1d13ef0242313aa3c..2c3bdd77e1fa1c86baa3a288caab4ad4324e2ef2 100644
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -15,6 +15,8 @@
 All layers just related to metric.
 """
 
+from __future__ import print_function
+
 import warnings
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
@@ -81,9 +83,9 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
     **Area Under the Curve (AUC) Layer**
 
     This implementation computes the AUC according to forward output and label.
-    It is used very widely in binary classification evaluation. 
+    It is used very widely in binary classification evaluation.
 
-    Note: If input label contains values other than 0 and 1, it will be cast 
+    Note: If input label contains values other than 0 and 1, it will be cast
     to `bool`. Find the relevant definitions `here <https://en.wikipedia.org\
     /wiki/Receiver_operating_characteristic#Area_under_the_curve>`_.
 
@@ -93,14 +95,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
         2. PR: Precision Recall
 
     Args:
-        input(Variable): A floating-point 2D Variable, values are in the range 
-                         [0, 1]. Each row is sorted in descending order. This 
-                         input should be the output of topk. Typically, this 
+        input(Variable): A floating-point 2D Variable, values are in the range
+                         [0, 1]. Each row is sorted in descending order. This
+                         input should be the output of topk. Typically, this
                          Variable indicates the probability of each label.
-        label(Variable): A 2D int Variable indicating the label of the training 
+        label(Variable): A 2D int Variable indicating the label of the training
                          data. The height is batch size and width is always 1.
         curve(str): Curve type, can be 'ROC' or 'PR'. Default 'ROC'.
-        num_thresholds(int): The number of thresholds to use when discretizing 
+        num_thresholds(int): The number of thresholds to use when discretizing
                              the roc curve. Default 200.
         topk(int): only topk number of prediction output will be used for auc.
 
@@ -109,7 +111,7 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
 
     Examples:
         .. code-block:: python
-        
+
             # network is a binary classification model and label the ground truth
             prediction = network(image, is_infer=True)
             auc_out=fluid.layers.auc(input=prediction, label=label)
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index be852b67119182cc817495b5e993c872cb9a88bf..71592618f540a8f42d9a25dd8a1af5e67a592f21 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -11,24 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-#   Copyright (c ) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 """
 All layers just related to the neural network.
 """
 
+from __future__ import print_function
+
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
@@ -106,6 +94,7 @@ __all__ = [
     'image_resize_short',
     'resize_bilinear',
     'gather',
+    'scatter',
     'random_crop',
     'mean_iou',
     'relu',
@@ -362,7 +351,7 @@ def dynamic_lstm(input,
     """
 
     helper = LayerHelper('lstm', **locals())
-    size = size / 4
+    size = size // 4
     weight = helper.create_parameter(
         attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
     bias_size = [1, 7 * size]
@@ -552,7 +541,7 @@ def dynamic_lstmp(input,
     """
 
     helper = LayerHelper('lstmp', **locals())
-    size = size / 4
+    size = size // 4
     weight = helper.create_parameter(
         attr=helper.param_attr, shape=[proj_size, 4 * size], dtype=dtype)
     proj_weight = helper.create_parameter(
@@ -780,7 +769,7 @@ def gru_unit(input,
 
     helper = LayerHelper('gru_unit', **locals())
     dtype = helper.input_dtype()
-    size = size / 3
+    size = size // 3
 
     # create weight
     weight = helper.create_parameter(
@@ -1264,7 +1253,7 @@ def sequence_conv(input,
         outputs={"Out": pre_bias},
         attrs={
             'contextStride': filter_stride,
-            'contextStart': -int(filter_size / 2),
+            'contextStart': -int(filter_size // 2),
             'contextLength': filter_size
         })
     pre_act = helper.append_bias_op(pre_bias)
@@ -1320,15 +1309,15 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
 
 def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
     """
-    The input of the softmax operator is a tensor of any rank. The output tensor 
+    The input of the softmax operator is a tensor of any rank. The output tensor
     has the same shape as the input.
 
-    The input tensor will first be logically flattened to a 2-D matrix. The matrix's 
-    second dimension(row length) is as same as the last dimension of the input 
-    tensor, and the first dimension(column length) is the product of all other 
-    dimensions of the input tensor. For each row of the matrix, the softmax operator 
-    squashes the K-dimensional(K is the width of the matrix, which is also the size 
-    of the input tensor's last dimension) vector of arbitrary real values to a 
+    The input tensor will first be logically flattened to a 2-D matrix. The matrix's
+    second dimension(row length) is as same as the last dimension of the input
+    tensor, and the first dimension(column length) is the product of all other
+    dimensions of the input tensor. For each row of the matrix, the softmax operator
+    squashes the K-dimensional(K is the width of the matrix, which is also the size
+    of the input tensor's last dimension) vector of arbitrary real values to a
     K-dimensional vector of real values in the range [0, 1] that add up to 1.
 
     It computes the exponential of the given dimension and the sum of exponential
@@ -1496,7 +1485,7 @@ def conv2d(input,
     else:
         if num_channels % groups != 0:
             raise ValueError("num_channels must be divisible by groups.")
-        num_filter_channels = num_channels / groups
+        num_filter_channels = num_channels // groups
 
     filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
     stride = utils.convert_to_list(stride, 2, 'stride')
@@ -1507,7 +1496,7 @@ def conv2d(input,
         raise ValueError("use_cudnn should be True or False")
 
     input_shape = input.shape
-    filter_shape = [num_filters, num_filter_channels] + filter_size
+    filter_shape = [num_filters, int(num_filter_channels)] + filter_size
 
     def _get_default_param_initializer():
         std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
@@ -1658,7 +1647,7 @@ def conv3d(input,
     else:
         if num_channels % groups != 0:
             raise ValueError("num_channels must be divisible by groups.")
-        num_filter_channels = num_channels / groups
+        num_filter_channels = num_channels // groups
 
     filter_size = utils.convert_to_list(filter_size, 3, 'filter_size')
     stride = utils.convert_to_list(stride, 3, 'stride')
@@ -2393,16 +2382,16 @@ def conv2d_transpose(input,
         w_in = input.shape[3]
 
         filter_size_h = (output_size[0] - (h_in - 1) * stride[0] + 2 *
-                         padding[0] - 1) / dilation[0] + 1
+                         padding[0] - 1) // dilation[0] + 1
         filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + 2 *
-                         padding[1] - 1) / dilation[1] + 1
+                         padding[1] - 1) // dilation[1] + 1
         filter_size = [filter_size_h, filter_size_w]
     else:
         filter_size = utils.convert_to_list(filter_size, 2,
                                             'conv2d_transpose.filter_size')
 
     groups = 1 if groups is None else groups
-    filter_shape = [input_channel, num_filters / groups] + filter_size
+    filter_shape = [input_channel, num_filters // groups] + filter_size
     img_filter = helper.create_parameter(
         dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
 
@@ -2560,18 +2549,18 @@ def conv3d_transpose(input,
         w_in = input.shape[4]
 
         filter_size_d = (output_size[0] - (d_in - 1) * stride[0] + 2 *
-                         padding[0] - 1) / dilation[0] + 1
+                         padding[0] - 1) // dilation[0] + 1
         filter_size_h = (output_size[1] - (h_in - 1) * stride[1] + 2 *
-                         padding[1] - 1) / dilation[1] + 1
+                         padding[1] - 1) // dilation[1] + 1
         filter_size_w = (output_size[2] - (w_in - 1) * stride[2] + 2 *
-                         padding[2] - 1) / dilation[2] + 1
+                         padding[2] - 1) // dilation[2] + 1
         filter_size = [filter_size_d, filter_size_h, filter_size_w]
     else:
         filter_size = utils.convert_to_list(filter_size, 3,
                                             'conv3d_transpose.filter_size')
 
     groups = 1 if groups is None else groups
-    filter_shape = [input_channel, num_filters / groups] + filter_size
+    filter_shape = [input_channel, num_filters // groups] + filter_size
     img_filter = helper.create_parameter(
         dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
 
@@ -2678,15 +2667,15 @@ def beam_search(pre_ids,
 
     Refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_
     for more details.
-    
-    This layer does the search in beams for one time step. Specifically, it 
+
+    This layer does the search in beams for one time step. Specifically, it
     selects the top-K candidate word ids of current step from :attr:`ids`
     according to their :attr:`scores` for all source sentences, where K is
     :attr:`beam_size` and :attr:`ids, scores` are predicted results from the
     computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are
     the output of beam_search at previous step, they are needed for special use
     to handle ended candidate translations.
- 
+
     Note that the :attr:`scores` passed in should be accumulated scores, and
     length penalty should be done with extra operators before calculating the
     accumulated scores if needed, also suggest finding top-K before it and
@@ -3887,7 +3876,7 @@ def nce(input,
 def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None):
     """
     The hierarchical sigmoid operator is used to accelerate the training
-    process of language model. This operator organizes the classes into a 
+    process of language model. This operator organizes the classes into a
     complete binary tree, each leaf node represents a class(a word) and each
     internal node acts as a binary classifier. For each word there's a unique
     path from root to it's leaf node, hsigmoid calculate the cost for each
@@ -3897,9 +3886,9 @@ def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None):
 
     Refer to `Hierarchical Probabilistic Neural Network Language Model
     <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_
-    
+
     Args:
-        input (Variable): The input tensor variable with shape 
+        input (Variable): The input tensor variable with shape
             :math:`[N \\times D]`, where :math:`N` is the size of mini-batch,
             and :math:`D` is the feature size.
         label (Variable): The tensor variable contains labels of training data.
@@ -3907,7 +3896,7 @@ def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None):
         num_classes: (int), The number of classes, must not be less than 2.
         param_attr (ParamAttr|list of ParamAttr, default None): The parameter
              attribute for learnable parameters/weights of this layer.
-        bias_attr (ParamAttr|list of ParamAttr, default None):  The parameter 
+        bias_attr (ParamAttr|list of ParamAttr, default None):  The parameter
              attribute for the bias of this layer. If it is set to False, no
              bias will be applied.
 
@@ -5048,6 +5037,47 @@ def gather(input, index):
     return out
 
 
+def scatter(input, index, updates, name=None):
+    """
+    **Scatter Layer**
+
+    Output is obtained by updating the input on selected indices on the first
+    axis.
+
+    .. math::
+
+        Out = X
+        Out[Ids] = Updates
+
+    Args:
+        input (Variable): The source input with rank>=1.
+        index (Variable): The index input with rank=1. Its dtype should be
+                          int32 or int64 as it is used as indexes.
+        updates (Variable): The updated value of scatter op.
+        name (str|None): The output variable name. Default None.
+
+    Returns:
+        output (Variable): The output is a tensor with the same shape as input.
+
+    Examples:
+
+        .. code-block:: python
+
+            output = fluid.layers.scatter(input, index, updates)
+
+    """
+    helper = LayerHelper('scatter', **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="scatter",
+        inputs={"X": input,
+                "Ids": index,
+                "Updates": updates},
+        outputs={"Out": out})
+    return out
+
+
 @templatedoc()
 def random_crop(x, shape, seed=None):
     """
@@ -5306,23 +5336,23 @@ def rank_loss(label, left, right, name=None):
     is a pairwise ranking model with a training sample consisting of a pair
     of documents, A and B. Label P indicates whether A is ranked higher than B
     or not:
- 
+
     P = {0, 1} or {0, 0.5, 1}, where 0.5 means that there is no information
     about the rank of the input pair.
-    
+
     Rank loss layer takes three inputs: left (o_i), right (o_j) and
     label (P_{i,j}). The inputs respectively represent RankNet's output scores
     for documents A and B and the value of label P. The following equation
     computes rank loss C_{i,j} from the inputs:
-    
+
     $$
       C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\
       o_{i,j} =  o_i - o_j  \\
       \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
     $$
-    
-    Rank loss layer takes batch inputs with size batch_size (batch_size >= 1).   
- 
+
+    Rank loss layer takes batch inputs with size batch_size (batch_size >= 1).
+
     Args:
         label (Variable): Indicats whether A ranked higher than B or not.
         left (Variable): RankNet's output score for doc A.
@@ -5435,7 +5465,7 @@ def flatten(x, axis=1, name=None):
         axis = 2
       We get:
         Out.shape = (3 * 100, 4 * 100)
-    
+
     Case 2:
       Given
         X.shape = (3, 100, 100, 4)
@@ -5446,8 +5476,8 @@ def flatten(x, axis=1, name=None):
 
     Args:
         x (Variable): A tensor of rank >= axis.
-        axis (int): Indicate up to which input dimensions (exclusive) should 
-                    be flattened to the outer dimension of the output. 
+        axis (int): Indicate up to which input dimensions (exclusive) should
+                    be flattened to the outer dimension of the output.
                     The value for axis must be in the range [0, R], where R
                     is the rank of the input tensor. When axis = 0, the shape
                     of the output tensor is (1, (d_0 X d_1 ... d_n), where the
@@ -5463,7 +5493,7 @@ def flatten(x, axis=1, name=None):
 
     Raises:
         ValueError: If x is not a variable.
-        ValueError: If axis is not in range [0, rank(x)]. 
+        ValueError: If axis is not in range [0, rank(x)].
 
     Examples:
 
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index f70c7f2258ce588444cf46d6c8affc4c9555203e..7cd62efda8900c830f43d882a41ab03184ebe594 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 from .layer_function_generator import generate_layer_fn
 
 __activations__ = [
@@ -63,7 +65,6 @@ __all__ = [
     'uniform_random_batch_size_like',
     'gaussian_random',
     'gaussian_random_batch_size_like',
-    'scatter',
     'sum',
     'slice',
     'shape',
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index b93d721c12cb6ead044dc790f2f2af8a61a63b60..04e71497aa762e390c4123c0bf3d7f111a772dd4 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from ..layer_helper import LayerHelper
 from ..param_attr import ParamAttr
 from ..framework import convert_np_dtype_to_dtype_
diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py
index 49ec3088831dff415e042e1b0a632f63106eb07b..5688f04ab2382f5731e69c60225765a2094bba8c 100644
--- a/python/paddle/fluid/layers/utils.py
+++ b/python/paddle/fluid/layers/utils.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import numpy as np
 
 
diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py
index 53c33616f55be5f5ef7068a6e94418e17d739e3c..a9de09f31f4ed04ba1aa003e85b25fc5a91557e4 100644
--- a/python/paddle/fluid/lod_tensor.py
+++ b/python/paddle/fluid/lod_tensor.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from . import core
 import numpy as np
 
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index cd8934522755691217a99a2cca271badda55368e..592cb23eb9319658f8542ed5bc6ab3e95cfdb118 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -14,11 +14,15 @@
 """
 Fluid Metrics
 
-The metrics are accomplished via Python natively. 
+The metrics are accomplished via Python natively.
 """
+
+from __future__ import print_function
+
 import numpy as np
 import copy
 import warnings
+import six
 
 __all__ = [
     'MetricBase',
@@ -79,10 +83,10 @@ class MetricBase(object):
         """
         states = {
             attr: value
-            for attr, value in list(self.__dict__.items())
+            for attr, value in six.iteritems(self.__dict__)
             if not attr.startswith("_")
         }
-        for attr, value in list(states.items()):
+        for attr, value in six.iteritems(states):
             if isinstance(value, int):
                 setattr(self, attr, 0)
             elif isinstance(value, float):
@@ -105,7 +109,7 @@ class MetricBase(object):
         """
         states = {
             attr: value
-            for attr, value in list(self.__dict__.items())
+            for attr, value in six.iteritems(self.__dict__)
             if not attr.startswith("_")
         }
         config = {}
@@ -141,10 +145,10 @@ class CompositeMetric(MetricBase):
     """
     Composite multiple metrics in one instance.
     for example, merge F1, accuracy, recall into one Metric.
-    
+
     Examples:
         .. code-block:: python
-    
+
           labels = fluid.layers.data(name="data", shape=[1], dtype="int32")
           data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32")
           pred = fluid.layers.fc(input=data, size=1000, act="tanh")
diff --git a/python/paddle/fluid/net_drawer.py b/python/paddle/fluid/net_drawer.py
index 623a7d3fd05567a26bb6923550f597a0e1e27e32..0b61c23d07e95acf7b4564753f748e7fb497e73e 100644
--- a/python/paddle/fluid/net_drawer.py
+++ b/python/paddle/fluid/net_drawer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import argparse
 import json
 import logging
diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py
index 08480671d8a5c50bbec97930c451cbcdc241e1fe..051fe84364639ca6028326c0cb02b204a02531af 100644
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -11,6 +11,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
+import six
 from . import layers
 
 __all__ = [
@@ -210,7 +213,7 @@ def img_conv_group(input,
     conv_with_batchnorm = __extend_list__(conv_with_batchnorm)
     conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate)
 
-    for i in range(len(conv_num_filter)):
+    for i in six.moves.range(len(conv_num_filter)):
         local_conv_act = conv_act
         if conv_with_batchnorm[i]:
             local_conv_act = None
diff --git a/python/paddle/fluid/op.py b/python/paddle/fluid/op.py
index 93f021a360ac61f64e769d057df188d79f6f2bb6..667db10d3ebdd24ddd9efbe2310ebb331e268ee2 100644
--- a/python/paddle/fluid/op.py
+++ b/python/paddle/fluid/op.py
@@ -12,6 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
+import numpy as np
 import six
 
 import paddle.fluid.core as core
@@ -99,6 +102,8 @@ class OpDescCreationMethod(object):
                 new_attr = op_desc.attrs.add()
                 new_attr.name = attr.name
                 new_attr.type = attr.type
+                if isinstance(user_defined_attr, np.ndarray):
+                    user_defined_attr = user_defined_attr.tolist()
                 if attr.type == framework_pb2.INT:
                     new_attr.i = user_defined_attr
                 elif attr.type == framework_pb2.FLOAT:
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index a07325f46a2892222c2d1dcd74aa7cb01f6760a1..031ddd09a0b27b050b6ac651e4d8c46854092b2f 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import re
 from collections import defaultdict
 from paddle.fluid.framework import Program, Variable
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 2a3555ebdde4d54f63bb420218896560c1b40ffd..a7765c9591f0bd653c08036c46a36131906a758f 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -17,8 +17,10 @@ import multiprocessing
 from . import core
 from . import framework
 from . import executor
+from .. import compat as cpt
 import warnings
 import sys
+import six
 import os
 
 __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy']
@@ -95,7 +97,7 @@ class ParallelExecutor(object):
         self._places = []
         self._act_places = []
         if use_cuda:
-            for i in range(core.get_cuda_device_count()):
+            for i in six.moves.range(core.get_cuda_device_count()):
                 p = core.Place()
                 self._act_places.append(core.CUDAPlace(i))
                 p.set_place(self._act_places[-1])
@@ -103,7 +105,7 @@ class ParallelExecutor(object):
         else:
             cpu_num = int(
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            for i in range(cpu_num):
+            for i in six.moves.range(cpu_num):
                 p = core.Place()
                 self._act_places.append(core.CPUPlace())
                 p.set_place(self._act_places[-1])
@@ -153,11 +155,13 @@ class ParallelExecutor(object):
         self.executor = core.ParallelExecutor(
             self._places,
             set([
-                p.name for p in main.global_block().iter_parameters()
+                cpt.to_text(p.name)
+                for p in main.global_block().iter_parameters()
                 if not p.stop_gradient
             ]),
-            set(self.persistable_vars), main.desc, loss_name
-            if loss_name else '', scope, local_scopes, exec_strategy,
+            set(cpt.to_text(var) for var in self.persistable_vars), main.desc,
+            cpt.to_text(loss_name)
+            if loss_name else six.u(''), scope, local_scopes, exec_strategy,
             build_strategy, num_trainers, trainer_id)
         self.scope = scope
 
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index afae577656c8970338f3b02208fcb4c738628ab6..f0be794327f51cbbc4202b8b7b401b712b6d66a3 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import six
 
 from .initializer import Initializer, Xavier, Constant
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index 01983a830351b018770e6358f604781ffaae5800..e05885f5f5bfc169828c1c6e723dffff098c3c2e 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -12,9 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from . import core
 from contextlib import contextmanager
 import os
+import six
 
 __all__ = [
     'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler',
@@ -88,7 +91,7 @@ def cuda_profiler(output_file, output_mode=None, config=None):
     config = NVPROF_CONFIG if config is None else config
     config_file = 'nvprof_config_file'
     with open(config_file, 'wb') as fp:
-        fp.writelines(["%s\n" % item for item in config])
+        fp.writelines([six.b("%s\n" % item) for item in config])
     core.nvprof_init(output_file, output_mode, config_file)
     # Enables profiler collection by the active CUDA profiling tool.
     core.nvprof_start()
diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py
index 93b38ad3fa37bd4bff04c529cd5518a8138e55ea..a69c0c29d4675d3e6b9b2a2d766b8be9935092cf 100644
--- a/python/paddle/fluid/recordio_writer.py
+++ b/python/paddle/fluid/recordio_writer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import os
 import contextlib
 from . import core
diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py
index 6eaac4432d4df1288f37607a01484434542f1138..da38626111a6767e1a76a35d6d1375ccc1283de4 100644
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from . import framework
 from . import core
 
diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
index 36a1a223cfd7c69aff3e8648da990d23e4e75202..f6017a455df7e8bd197ef2563a759f843b5e7c73 100644
--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle
 import paddle.fluid as fluid
 import contextlib
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
index 9e4c384d92943227c2d68da829e6019e649a35fb..48c0f3d3611547308b5d4460748d3aab765f5805 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
@@ -28,12 +28,14 @@ images per class.
 
 """
 
+from __future__ import print_function
+
 import itertools
 import numpy
-import paddle.v2.dataset.common
+import paddle.dataset.common
 import tarfile
+import six
 from six.moves import cPickle as pickle
-from six.moves import zip
 
 __all__ = ['train10']
 
@@ -44,20 +46,25 @@ CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
 
 def reader_creator(filename, sub_name, batch_size=None):
     def read_batch(batch):
-        data = batch['data']
-        labels = batch.get('labels', batch.get('fine_labels', None))
+        data = batch[six.b('data')]
+        labels = batch.get(
+            six.b('labels'), batch.get(six.b('fine_labels'), None))
         assert labels is not None
-        for sample, label in zip(data, labels):
+        for sample, label in six.moves.zip(data, labels):
             yield (sample / 255.0).astype(numpy.float32), int(label)
 
     def reader():
         with tarfile.open(filename, mode='r') as f:
-            names = (each_item.name for each_item in f
-                     if sub_name in each_item.name)
+            names = [
+                each_item.name for each_item in f if sub_name in each_item.name
+            ]
 
             batch_count = 0
             for name in names:
-                batch = pickle.load(f.extractfile(name))
+                if six.PY2:
+                    batch = pickle.load(f.extractfile(name))
+                else:
+                    batch = pickle.load(f.extractfile(name), encoding='bytes')
                 for item in read_batch(batch):
                     if isinstance(batch_size, int) and batch_count > batch_size:
                         break
@@ -78,6 +85,6 @@ def train10(batch_size=None):
     :rtype: callable
     """
     return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
         'data_batch',
         batch_size=batch_size)
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
index a1f62db093904b617f0e37dc20d586ccea7eacd2..be494a0d340c62fb35afbf97fba38eff08a965e6 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle
 import paddle.fluid as fluid
 import numpy
@@ -55,7 +57,7 @@ def resnet_cifar10(input, depth=32):
         return tmp
 
     assert (depth - 2) % 6 == 0
-    n = (depth - 2) / 6
+    n = (depth - 2) // 6
     conv1 = conv_bn_layer(
         input=input, ch_out=16, filter_size=3, stride=1, padding=1)
     res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
index 8429551765740e7db0eda82ce0b17cff129359b0..dbc7bc06c93157f271c79e85b6925468e861e57f 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle
 import paddle.fluid as fluid
 import numpy
diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
index e3602e2d5643c233b2575d1adb7f181127f60287..ec4e1c768c7f2a2421ac409a2eecc0100c086a6a 100755
--- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
+++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle
 import paddle.fluid as fluid
 import numpy as np
diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
index 6fb0c85a8be2b4560ea1fdb32f01146a9206ee78..560f1189581f631dc6a3470cf8f22f902ca26f26 100644
--- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import contextlib
 
 import numpy as np
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
index 898807db6f343cbefcc877e0f03ed6c5b82dd669..187bef1b0c1a614fbca88ef22097831d7bd5cd7f 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import argparse
 import paddle.fluid as fluid
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
index 6dd64be315159f1835244fa027e578434e6cb038..b95e7db122adbb1414da1691926c920b963fd6fe 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import argparse
 import paddle.fluid as fluid
 import paddle
diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
index 60f3d8e105209938360487d963b0328d95e7b1f0..9e2767783bb6748cfc8f95567627068d7532a8c8 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import math
 import sys
 import numpy as np
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
index 24e65d1bd54cff7ad64453a3a61f50351d32ef08..097c2a468fca558106aba2f24c332256189d9076 100644
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle
 import paddle.fluid as fluid
 from functools import partial
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
index b3b1505a0fad07144f3f53c22abd5553054d8c51..5f74cd142590abb93f8846bc831a9f5e3dd2f311 100644
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle
 import paddle.fluid as fluid
 from functools import partial
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
index 25f99ff0fd2d1050bb62338a6bf87aa29f913fb6..284a6ca168636377699c287236c491352566909b 100644
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle
 import paddle.fluid as fluid
 from functools import partial
diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
index 02e65cf56c4d1bd262831320befd2edc735c0d1c..1c7cf3199a07c3f65d967eda70a481b1bd1b1638 100644
--- a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
+++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle
 import paddle.fluid as fluid
 import numpy as np
diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
index ce6342c2dad0b33e57d0ea90fc6ef1660ae4e68b..82f1c6615f3c4ca54bf5e979b55082022cd4da9f 100644
--- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from paddle.fluid.layers.device import get_places
 import unittest
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py
index 37b64fa94a9aad7042e153e414ed29de3142db5a..334294ab485cf203aa0ccf680a53010322d3af3b 100644
--- a/python/paddle/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/test_fit_a_line.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle
 import paddle.fluid as fluid
 import contextlib
diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
index de6fe5f140a86545e3291db165af824739a814ef..9fe361425c128590da910128beaccb3336f8ba57 100644
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle
 import paddle.fluid as fluid
 import contextlib
@@ -60,7 +62,7 @@ def resnet_cifar10(input, depth=32):
         return tmp
 
     assert (depth - 2) % 6 == 0
-    n = (depth - 2) / 6
+    n = (depth - 2) // 6
     conv1 = conv_bn_layer(
         input=input, ch_out=16, filter_size=3, stride=1, padding=1)
     res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index b7ac911cafdc751f38c7f66bc48263a17a84dc08..f63387a90617dc4e9b7c9ee7caa2d01595237a03 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import contextlib
 import math
 import numpy as np
diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py
index 462faad3e1cb7108f3bd6934017efe25fb9a4276..5e241aaa32727686b84a0354a11d5a92f9576a90 100644
--- a/python/paddle/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/test_machine_translation.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import contextlib
 
 import numpy as np
diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py
index 3e5f76d12d41d016c995e5c85feda3c1847e356f..da216d0cc4a2867cb169240d28235b6db747a818 100644
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid.core as core
 import math
 import os
diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py
index b30c8771fcf267260c4c5aa7076bedc89e3b7e8b..cf8c48f34697d789d3d81d4d94f90a7169657baf 100644
--- a/python/paddle/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/fluid/tests/book/test_recommender_system.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import math
 import sys
 import os
diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
index 2e79be2bd0fc7a368df86e188b7fa616055bb3e7..91c8705aa4c88dbfeea45e15c368459ba5b5ac1f 100644
--- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
+++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 import paddle
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py
index e761e05795313da23a9d984263ac2e202939b1e7..fe063eb4629dbe06dc65ce98c6c01858db901f03 100644
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.layers.device import get_places
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
index ccc62b442f62fa9fa175de031b0732febe38ee9a..f530f8f4882a23df18c141b51560cb618fce86b5 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import math
 import sys
 
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
index b2a59d27da9b3348b581d51a68d769bbf3b90d35..3951e7b8ca649b63eea4b311f6205a6c7d761804 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import sys
 
 import paddle
@@ -56,7 +58,7 @@ def resnet_cifar10(input, depth=32):
         return tmp
 
     assert (depth - 2) % 6 == 0
-    n = (depth - 2) / 6
+    n = (depth - 2) // 6
     conv1 = conv_bn_layer(
         input=input, ch_out=16, filter_size=3, stride=1, padding=1)
     res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
index 323ddfb6911fdd57b32344933373189370005126..1ad51936b5b8f7c5149452d6033754a570c72654 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 import paddle
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/demo/fc_gan.py b/python/paddle/fluid/tests/demo/fc_gan.py
index 3d92f50f0adeca79adefc291cdfba6a012fc2118..bd77779ce6ab5cf19e3e5ace3e51e39734b27c10 100644
--- a/python/paddle/fluid/tests/demo/fc_gan.py
+++ b/python/paddle/fluid/tests/demo/fc_gan.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import errno
 import math
 import os
diff --git a/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
index a00325d79be2eba4d7f770b5316c5857952fe272..45a104ec9625eacfcb87ea6eae619e3d71410da9 100644
--- a/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
+++ b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import sys
 import paddle.fluid as fluid
 import paddle.v2 as paddle
diff --git a/python/paddle/fluid/tests/demo/file_reader/train.py b/python/paddle/fluid/tests/demo/file_reader/train.py
index bc3a6dc81d24afec66ed1489aead1cff79a59bca..5f5d2848da42e18f2a142faae0c89352344d8cee 100644
--- a/python/paddle/fluid/tests/demo/file_reader/train.py
+++ b/python/paddle/fluid/tests/demo/file_reader/train.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid as fluid
 import numpy
 import sys
diff --git a/python/paddle/fluid/tests/demo/pyreader.py b/python/paddle/fluid/tests/demo/pyreader.py
index 82065401935036ca346fa395c033f0f57100f01b..ec61e0ebae4feb1a2177da916b77b2ba2d3981b9 100644
--- a/python/paddle/fluid/tests/demo/pyreader.py
+++ b/python/paddle/fluid/tests/demo/pyreader.py
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy
+import six
 
 import paddle
 import paddle.dataset.mnist as mnist
@@ -31,7 +34,7 @@ def network(is_train):
 
     hidden = img
 
-    for i in xrange(2):
+    for i in six.moves.xrange(2):
         hidden = fluid.layers.fc(input=hidden, size=100, act='tanh')
         hidden = fluid.layers.dropout(
             hidden, dropout_prob=0.5, is_test=not is_train)
@@ -74,7 +77,7 @@ def main():
 
     test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512))
 
-    for epoch_id in xrange(10):
+    for epoch_id in six.moves.xrange(10):
         train_reader.start()
         try:
             while True:
diff --git a/python/paddle/fluid/tests/no_test_concurrency.py b/python/paddle/fluid/tests/no_test_concurrency.py
index 3bc0c9808e2345b610dea79abc56cfb0065ea46f..b5d7676f4a2cb085c6900cd0bd0644afa2b2afd5 100644
--- a/python/paddle/fluid/tests/no_test_concurrency.py
+++ b/python/paddle/fluid/tests/no_test_concurrency.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/notest_concurrency.py b/python/paddle/fluid/tests/notest_concurrency.py
index 77107f8b36f31c1f494b0ade218ee047ef7eb7c6..fd9da4cce0ea51c53b4b01e7c3dc2a2ed1eeb089 100644
--- a/python/paddle/fluid/tests/notest_concurrency.py
+++ b/python/paddle/fluid/tests/notest_concurrency.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/test_beam_search_decoder.py b/python/paddle/fluid/tests/test_beam_search_decoder.py
index 8bf750940d570cdad5e110168afc5f632202e869..fe8a9daa3bea4b99bb42edc78538685c5ce11fe3 100644
--- a/python/paddle/fluid/tests/test_beam_search_decoder.py
+++ b/python/paddle/fluid/tests/test_beam_search_decoder.py
@@ -15,6 +15,8 @@
 A simple machine translation demo using beam search decoder.
 """
 
+from __future__ import print_function
+
 import contextlib
 import numpy as np
 import paddle
diff --git a/python/paddle/fluid/tests/test_cpp_reader.py b/python/paddle/fluid/tests/test_cpp_reader.py
index 6cc291dfcffdd7083f498389834e37bd06ca4572..b2a5253b9500bb504c651b2ab684206133199ada 100644
--- a/python/paddle/fluid/tests/test_cpp_reader.py
+++ b/python/paddle/fluid/tests/test_cpp_reader.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle
 import paddle.fluid as fluid
 import numpy as np
diff --git a/python/paddle/fluid/tests/test_data_feeder.py b/python/paddle/fluid/tests/test_data_feeder.py
index 30b7a634a2b978df85d6432854ef12285460be44..01de564aa438e5f14a5c578f7bbbfb475155ca55 100644
--- a/python/paddle/fluid/tests/test_data_feeder.py
+++ b/python/paddle/fluid/tests/test_data_feeder.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid as fluid
 import unittest
 
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index fd45abd0a77cb54a3ca8e60cf80a1efe9f9d2060..1467e72caac26a3ea2a0c770d665141988696630 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid.framework import Program, program_guard
diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py
index e8edd7fbbb31b1a6ecbf2a25a7d39e7b3f66363a..3c977afc7c813908fbe2dfb7445d9ca183cf2231 100644
--- a/python/paddle/fluid/tests/test_error_clip.py
+++ b/python/paddle/fluid/tests/test_error_clip.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 import paddle
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/test_gradient_clip.py b/python/paddle/fluid/tests/test_gradient_clip.py
index d530601f13be6810a8a99b13c92faf584df568f9..266687fcd092dfdeec9343e2592f4c22b683d588 100644
--- a/python/paddle/fluid/tests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/test_gradient_clip.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 import paddle
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py
index 082f64c146f65eee4be0757d07495c33764fa841..61d81f483636a99ea9e0282de89f12e47f3b824c 100644
--- a/python/paddle/fluid/tests/test_if_else_op.py
+++ b/python/paddle/fluid/tests/test_if_else_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle
 import paddle.fluid.layers as layers
 from paddle.fluid.framework import Program, program_guard
@@ -28,7 +30,8 @@ import numpy as np
 
 
 class TestMNISTIfElseOp(unittest.TestCase):
-    def test_raw_api(self):
+    # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379
+    def not_test_raw_api(self):
         prog = Program()
         startup_prog = Program()
         with program_guard(prog, startup_prog):
@@ -89,7 +92,8 @@ class TestMNISTIfElseOp(unittest.TestCase):
                     return
         self.assertFalse(True)
 
-    def test_ifelse(self):
+    # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379
+    def not_test_ifelse(self):
         prog = Program()
         startup_prog = Program()
         with program_guard(prog, startup_prog):
@@ -151,6 +155,13 @@ class TestIfElse(unittest.TestCase):
         self.cond_value = 0.5
         self.data = np.random.rand(25, 1).astype(np.float32)
 
+    def numpy_cal(self):
+        s1 = self.data[np.where(self.data < self.cond_value)]
+        res = np.sum(np.exp(s1))
+        s2 = self.data[np.where(self.data >= self.cond_value)]
+        res += np.sum(np.tanh(s2))
+        return res
+
     def compare_ifelse_op_and_numpy(self, place):
         self.set_test_case()
 
@@ -164,10 +175,12 @@ class TestIfElse(unittest.TestCase):
             ie = layers.IfElse(ifcond)
             with ie.true_block():
                 true_target = ie.input(src)
+                true_target = fluid.layers.exp(true_target)
                 ie.output(true_target)
 
             with ie.false_block():
                 false_target = ie.input(src)
+                false_target = fluid.layers.tanh(false_target)
                 ie.output(false_target)
             if_out = ie()
             out = layers.reduce_sum(if_out)
@@ -178,7 +191,8 @@ class TestIfElse(unittest.TestCase):
             o1, = exe.run(fluid.default_main_program(),
                           feed={'data': self.data},
                           fetch_list=[out])
-            o2 = np.sum(self.data)
+            o2 = self.numpy_cal()
+
             self.assertTrue(
                 np.allclose(
                     o1, o2, atol=1e-8),
diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py
index f7a9dd4129027417a06a6c25ff9a801fff259c5e..722b5f07b04f9374db3f262f5134347fe753ba19 100644
--- a/python/paddle/fluid/tests/test_lod_tensor.py
+++ b/python/paddle/fluid/tests/test_lod_tensor.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid as fluid
 from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor
 import numpy as np
diff --git a/python/paddle/fluid/tests/test_python_operator_overriding.py b/python/paddle/fluid/tests/test_python_operator_overriding.py
index b5ac97eac559e8c52a8949cfd63fc8671ba52514..5f92c437ec726f510d9194d23f1a01a5478827d6 100644
--- a/python/paddle/fluid/tests/test_python_operator_overriding.py
+++ b/python/paddle/fluid/tests/test_python_operator_overriding.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/benchmark.py b/python/paddle/fluid/tests/unittests/benchmark.py
index b98a92dcbe5626c6cca93b3f5894302399793bf9..9ea95f3e8700274977eda4ca113a6468c631584c 100644
--- a/python/paddle/fluid/tests/unittests/benchmark.py
+++ b/python/paddle/fluid/tests/unittests/benchmark.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 import unittest
 import time
@@ -54,7 +56,7 @@ class BenchmarkSuite(OpTest):
 
     def _get_input_names(self):
         inputs = []
-        for name, value in list(self.inputs.items()):
+        for name, value in six.iteritems(self.inputs):
             if isinstance(value, list):
                 inputs.extend([sub_name for sub_name, _ in value])
             inputs.append(name)
@@ -62,7 +64,7 @@ class BenchmarkSuite(OpTest):
 
     def _get_output_names(self):
         outputs = []
-        for var_name, var in list(self.outputs.items()):
+        for var_name, var in six.iteritems(self.outputs):
             if isinstance(var, list):
                 for sub_var_name, sub_var in var:
                     outputs.append(sub_var_name)
diff --git a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
index 91a5f1bca4441d80489a02eb9283928e38321826..0e7338b839e2a7f5808e7a752e9ca6389622c2cb 100644
--- a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 
diff --git a/python/paddle/fluid/tests/unittests/decorators.py b/python/paddle/fluid/tests/unittests/decorators.py
index d1165e2a9199454dbcc1fda411afad20449bcc92..1a5f4540cf033b4d3244537cc5016ee06f341464 100644
--- a/python/paddle/fluid/tests/unittests/decorators.py
+++ b/python/paddle/fluid/tests/unittests/decorators.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid as fluid
 
 __all__ = ['many_times', 'prog_scope']
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py
index 8f5ba33f7cbf5286edc4503c219fd3cdff60c517..85a96c0b53f6bc08687965048d6251265055a6fe 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 import argparse
 import time
@@ -44,7 +46,8 @@ def cnn_model(data):
         pool_size=2,
         pool_stride=2,
         act="relu",
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant()))
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.3)))
     conv_pool_2 = fluid.nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
@@ -52,7 +55,8 @@ def cnn_model(data):
         pool_size=2,
         pool_stride=2,
         act="relu",
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant()))
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.2)))
 
     SIZE = 10
     input_shape = conv_pool_2.shape
@@ -64,8 +68,7 @@ def cnn_model(data):
         size=SIZE,
         act="softmax",
         param_attr=fluid.param_attr.ParamAttr(
-            initializer=fluid.initializer.NormalInitializer(
-                loc=0.0, scale=scale, seed=1)))
+            initializer=fluid.initializer.Constant(value=0.1)))
     return predict
 
 
diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
index d576a173ce2546119ede49128ef69d240c7cf482..0387e911880256ea6b8efb6f2311bbf4c4f8c0f2 100644
--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 import argparse
-import six
 import time
 import math
 
@@ -128,7 +129,12 @@ class SE_ResNeXt():
             input=conv, pool_size=7, pool_type='avg', global_pooling=True)
         drop = fluid.layers.dropout(x=pool, dropout_prob=0.2)
         stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
-        out = fluid.layers.fc(input=drop, size=class_dim, act='softmax')
+        out = fluid.layers.fc(
+            input=drop,
+            size=class_dim,
+            act='softmax',
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.2)))
         return out
 
     def shortcut(self, input, ch_out, stride):
@@ -173,12 +179,12 @@ class SE_ResNeXt():
             num_filters=num_filters,
             filter_size=filter_size,
             stride=stride,
-            padding=(filter_size - 1) / 2,
+            padding=(filter_size - 1) // 2,
             groups=groups,
             act=None,
             # avoid pserver CPU init differs from GPU
             param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant()),
+                initializer=fluid.initializer.Constant(value=0.2)),
             bias_attr=False)
         return fluid.layers.batch_norm(input=conv, act=act)
 
@@ -187,7 +193,7 @@ class SE_ResNeXt():
             input=input, pool_size=0, pool_type='avg', global_pooling=True)
         stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
         squeeze = fluid.layers.fc(input=pool,
-                                  size=num_channels / reduction_ratio,
+                                  size=num_channels // reduction_ratio,
                                   act='relu')
         stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
         excitation = fluid.layers.fc(input=squeeze,
@@ -227,10 +233,8 @@ class DistSeResneXt2x2(TestDistRunnerBase):
         lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
 
         optimizer = fluid.optimizer.Momentum(
-            # FIXME(typhoonzero): add back LR decay once ParallelExecutor fixed.
-            #learning_rate=fluid.layers.piecewise_decay(
-            #    boundaries=bd, values=lr),
-            learning_rate=base_lr,
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
             momentum=0.9,
             regularization=fluid.regularizer.L2Decay(1e-4))
         optimizer.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py
index ee8020a73546cb9037e9dc4be589c62bb1b6b937..239adcb9d5900d4073a6c07cb189ab7503aea86e 100644
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 import argparse
 import time
@@ -22,6 +24,7 @@ import paddle.fluid as fluid
 from paddle.fluid import core
 import os
 import sys
+import six
 import transformer_model
 import paddle.dataset.wmt16 as wmt16
 
@@ -159,6 +162,7 @@ def get_model():
     avg_cost = transformer(use_feed=False)
     optimizer = fluid.optimizer.Adam()
     optimizer.minimize(avg_cost)
+    fluid.memory_optimize(fluid.default_main_program())
     return avg_cost
 
 
@@ -222,7 +226,7 @@ class DistTransformer2x2(object):
 
         first_loss, = exe.run(fetch_list=[avg_cost.name])
         print(first_loss)
-        for i in xrange(5):
+        for i in six.moves.xrange(5):
             _ = exe.run(fetch_list=[avg_cost.name])
         last_loss, = exe.run(fetch_list=[avg_cost.name])
         print(last_loss)
@@ -261,9 +265,9 @@ def main(role="pserver",
 
 
 if __name__ == "__main__":
-    if len(sys.argv) != 7:
+    if len(sys.argv) != 8:
         print(
-            "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
+            "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]"
         )
     role = sys.argv[1]
     endpoints = sys.argv[2]
@@ -271,6 +275,8 @@ if __name__ == "__main__":
     current_endpoint = sys.argv[4]
     trainers = int(sys.argv[5])
     is_dist = True if sys.argv[6] == "TRUE" else False
+    # FIXME(typhoonzero): refine this test.
+    is_async = True if sys.argv[7] == "TRUE" else False
     main(
         role=role,
         endpoints=endpoints,
diff --git a/python/paddle/fluid/tests/unittests/dist_word2vec.py b/python/paddle/fluid/tests/unittests/dist_word2vec.py
index 54a70f4adb4a9bb24e3c618a7fe71f42a376609b..0ad994a258c04cabc807823b7d2a8ae8bb62ab2c 100644
--- a/python/paddle/fluid/tests/unittests/dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 import argparse
 import time
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index b27d773f09d9a6daad5a10b65e683f4e11881de1..972e44c9528a29417d9689dcb2408b9381346f31 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -12,9 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import random
+import six
 import time
 import itertools
 import collections
@@ -26,15 +29,13 @@ from paddle.fluid.op import Operator
 from paddle.fluid.executor import Executor
 from paddle.fluid.framework import Program, OpProtoHolder, Variable
 from testsuite import create_op, set_input, append_input_output, append_loss_ops
-from functools import reduce
-from six.moves import zip
 
 
 def randomize_probability(batch_size, class_num, dtype='float32'):
     prob = np.random.uniform(
         0.1, 1.0, size=(batch_size, class_num)).astype(dtype)
     prob_sum = prob.sum(axis=1)
-    for i in range(len(prob)):
+    for i in six.moves.xrange(len(prob)):
         prob[i] /= prob_sum[i]
     return prob
 
@@ -51,7 +52,7 @@ def get_numeric_gradient(place,
     set_input(scope, op, inputs, place)
 
     def product(dim):
-        return reduce(lambda a, b: a * b, dim, 1)
+        return six.moves.reduce(lambda a, b: a * b, dim, 1)
 
     def get_output():
         sum = []
@@ -103,7 +104,7 @@ def get_numeric_gradient(place,
 
     # we only compute gradient of one element each time.
     # we use a for loop to compute the gradient of every element.
-    for i in range(tensor_size):
+    for i in six.moves.xrange(tensor_size):
         if in_place:
             set_input(scope, op, inputs, place)
 
@@ -161,7 +162,7 @@ class OpTest(unittest.TestCase):
             assert isinstance(
                 numpy_dict,
                 dict), "self.inputs, self.outputs must be numpy_dict"
-            for var_name, var_value in numpy_dict.items():
+            for var_name, var_value in six.iteritems(numpy_dict):
                 if isinstance(var_value, (np.ndarray, np.generic)):
                     self.try_call_once(var_value.dtype)
                 elif isinstance(var_value, (list, tuple)):
@@ -225,7 +226,7 @@ class OpTest(unittest.TestCase):
 
     def _get_io_vars(self, block, numpy_inputs):
         inputs = {}
-        for name, value in numpy_inputs.items():
+        for name, value in six.iteritems(numpy_inputs):
             if isinstance(value, list):
                 var_list = [
                     block.var(sub_name) for sub_name, sub_value in value
@@ -268,7 +269,7 @@ class OpTest(unittest.TestCase):
         # if the fetch_list is customized by user, we use it directly.
         # if not, fill the fetch_list by the user configured outputs in test.
         if len(fetch_list) == 0:
-            for var_name, var in outputs.items():
+            for var_name, var in six.iteritems(outputs):
                 if isinstance(var, list):
                     for v in var:
                         fetch_list.append(v)
@@ -366,12 +367,13 @@ class OpTest(unittest.TestCase):
         for place in places:
             outs = self.calc_output(place)
             outs = [np.array(out) for out in outs]
+            outs.sort(key=len)
             checker(outs)
 
     def __assert_is_close(self, numeric_grads, analytic_grads, names,
                           max_relative_error, msg_prefix):
 
-        for a, b, name in zip(numeric_grads, analytic_grads, names):
+        for a, b, name in six.moves.zip(numeric_grads, analytic_grads, names):
             abs_a = np.abs(a)
             abs_a[abs_a < 1e-3] = 1
 
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 67c35e9de7e83699bf30ca946856bb907152cbdd..74e9d5c5f91e53a315c85d428571ce45bacede8a 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import multiprocessing
 import os
 import unittest
@@ -36,7 +38,8 @@ class TestParallelExecutorBase(unittest.TestCase):
                                   seed=None,
                                   use_parallel_executor=True,
                                   use_reduce=False,
-                                  optimizer=fluid.optimizer.Adam):
+                                  optimizer=fluid.optimizer.Adam,
+                                  use_fast_executor=False):
         def run_executor(exe, feed, fetch_list, program=None):
             if isinstance(exe, fluid.ParallelExecutor):
                 res = exe.run(fetch_list=fetch_list, feed=feed)
@@ -69,6 +72,8 @@ class TestParallelExecutorBase(unittest.TestCase):
             startup_exe.run(startup)
             exec_strategy = fluid.ExecutionStrategy()
             exec_strategy.allow_op_delay = allow_op_delay
+            if use_fast_executor:
+                exec_strategy.use_experimental_executor = True
 
             build_strategy = fluid.BuildStrategy()
             build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py
index db1861fd10e371ebe631a16380af591875886769..1b2b53f2d4ce91ae7b5b191ed770b5338f0948c8 100644
--- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py
index 7d554c2276c9acd710d14c8f8b32c802e3e17515..611d0dd076b827b0f528f2e3a31182cc4939d1f1 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 34f9cf0620fd1351111e93e16ed5f7e765d7078b..30651c1326328180592520447e597aa722146a42 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_adadelta_op.py b/python/paddle/fluid/tests/unittests/test_adadelta_op.py
index 1b892e64c7654a1a3905672813452650885790a5..969a7da3b71b69296f3313342adbf989c60edb50 100644
--- a/python/paddle/fluid/tests/unittests/test_adadelta_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adadelta_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_adagrad_op.py
index 2f0ea79f4d6afe91ee7e0d747f3d8f4884d8f9ee..fc3b7ce2fd87afc22030bcca55236fb949c1f129 100644
--- a/python/paddle/fluid/tests/unittests/test_adagrad_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adagrad_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index fa4b39879c0ede569b6802502b2c71a93b163373..5318d2f9766ce671925be614feef57d679270b19 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_adamax_op.py b/python/paddle/fluid/tests/unittests/test_adamax_op.py
index 8099beefa583d152715334e83f0c6e8e4a3e7a0d..a6d1be7616c73019cd8f66dcf0c108cd58ec600b 100644
--- a/python/paddle/fluid/tests/unittests/test_adamax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adamax_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py b/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py
index 9c7d5d41f0c512a9fb609dce304c1eed929d28b5..d31eaa0114c3b035add3e6ca792696b5cafb9690 100644
--- a/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py
+++ b/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import sys
diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
index e04412f809cdd75d07d28a60f0c2f19041a684f6..0712e102b30fc72c7f8b62eb9230e7f4ab615ef0 100644
--- a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_argsort_op.py b/python/paddle/fluid/tests/unittests/test_argsort_op.py
index b29a102a3880406156481fdac54ca7043d3415db..7bc6f2599d617b192908da9b57d0cd715019bd71 100644
--- a/python/paddle/fluid/tests/unittests/test_argsort_op.py
+++ b/python/paddle/fluid/tests/unittests/test_argsort_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
index 0000fb0958a129e9e1098de1fad888c503cfbdc5..b86d0bc43a9f84988f2b1b27f7aeffce46a46bd9 100644
--- a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
+++ b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid.core as core
 import paddle.fluid.layers as layers
diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py
index e93c02bd3ee9f710cbb9bff4e195dfc3caabe422..ba2eecfaf197ea63c187e77ae7ae8cf34873d66b 100644
--- a/python/paddle/fluid/tests/unittests/test_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_assign_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import op_test
 import numpy
 import unittest
diff --git a/python/paddle/fluid/tests/unittests/test_assign_value_op.py b/python/paddle/fluid/tests/unittests/test_assign_value_op.py
index 02f2e6eddc80fcce4ca5a444cff82db355c085ca..5a9d8efef1f3e5a9e116720c2ffe32c2ef0a082f 100644
--- a/python/paddle/fluid/tests/unittests/test_assign_value_op.py
+++ b/python/paddle/fluid/tests/unittests/test_assign_value_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import op_test
diff --git a/python/paddle/fluid/tests/unittests/test_auc_op.py b/python/paddle/fluid/tests/unittests/test_auc_op.py
index 6580c70ca68c4ba24919f03d071f6f88fb68953c..5393a17e674a3cad6d705a1ff7a45320e644af94 100644
--- a/python/paddle/fluid/tests/unittests/test_auc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auc_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py
index 18fa5461590134d2032a29e40699109c12092c6d..1286cee8dc1855c1b1695da46ae0b5222c065114 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index f805fdc35f624bf6e9d94d66839dcb2a0143a29b..80261eff4e747f87658bc7c9114c21bee511df09 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
index 4a3ac2a31e072eb1a15af31f558cf9f626a7ac53..51eee41ab2d4d1113426991c63bee949cca15ad4 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
index e8283fc9422d93af5735aaec1a165b46ac1ef78e..c28dda4b53ce5d394ff11222e5df8d257b4e80da 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import logging
 from paddle.fluid.op import Operator, DynamicRecurrentOp
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
index b04f25ef874cc6204211a4f5f5991a0ec8c473dd..bed847c3c168c906a89c32631b2a8f0ba2e6e7be 100644
--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py
index d20a11e27eaac12534ea33c398247adb8db01d4b..46831119c5fee938780ec8fdb9d0cdb3b63a473d 100644
--- a/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
index ceeca25b74d85ed2874d672e402e3186c4ce7d47..5cc8e2ba15d260b988ee66a5711aed42ca04c10b 100644
--- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
@@ -11,6 +11,8 @@
 #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #See the License for the specific language governing permissions and
 #limitations under the License.
+
+from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
index 4ce9a4783e2332b6882164a70e1462c6a6d31bef..2511c5c22e012babdeb71a71d3546456ea2ceaf3 100644
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import sys
diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
index 7f2a9e6971ed933463216e38498d48ab132a1a37..4120a18b72f87c7e750a0fb68780292b58e3a7f4 100644
--- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/unittests/test_cast_op.py b/python/paddle/fluid/tests/unittests/test_cast_op.py
index b8d3ed3aa3eb0e47e79f46cdf681a3b9cca46036..71a2ccb6da47588d84c263105560626435ac461a 100644
--- a/python/paddle/fluid/tests/unittests/test_cast_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cast_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import op_test
 import unittest
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
index 354110f1f96f6b4aad1a4866c8d1337dec3acd16..48eb8e9f7585d41d541ac3645e9a50dc79058de7 100644
--- a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
+++ b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
index 129958fa2818418dccce91683a9424e6324c6ac2..6103c3aafc0bb154194314830c5c8c5d89460cfe 100644
--- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py
index 3df80c8ec8fa39c0aaa2b8726fe3b37aef488442..32677bdb4c897b4e20f8fb166b080ac6e6a221b7 100644
--- a/python/paddle/fluid/tests/unittests/test_clip_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py
index 405afebae85eaae6f6af0012058ad58c8bb69a2f..437ad35538a5fa380f950fd3b71e334276214ec7 100644
--- a/python/paddle/fluid/tests/unittests/test_compare_op.py
+++ b/python/paddle/fluid/tests/unittests/test_compare_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import op_test
 import unittest
 import numpy
diff --git a/python/paddle/fluid/tests/unittests/test_compat.py b/python/paddle/fluid/tests/unittests/test_compat.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c2c46f99a82875b917a330d6ec76062222420de
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_compat.py
@@ -0,0 +1,505 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle.compat as cpt
+import six
+
+
+class TestCompatible(unittest.TestCase):
+    def test_type(self):
+        if six.PY2:
+            self.assertEqual(cpt.int_type, int)
+            self.assertEqual(cpt.long_type, long)
+        else:
+            self.assertEqual(cpt.int_type, int)
+            self.assertEqual(cpt.long_type, int)
+
+    def test_to_text(self):
+        # Only support python2.x and python3.x now
+        self.assertTrue(six.PY2 | six.PY3)
+
+        if six.PY2:
+            # check None
+            self.assertIsNone(cpt.to_text(None))
+
+            # check all string related types
+            self.assertTrue(isinstance(cpt.to_text(str("")), unicode))
+            self.assertTrue(isinstance(cpt.to_text(str("123")), unicode))
+            self.assertTrue(isinstance(cpt.to_text(b""), unicode))
+            self.assertTrue(isinstance(cpt.to_text(b""), unicode))
+            self.assertTrue(isinstance(cpt.to_text(u""), unicode))
+            self.assertTrue(isinstance(cpt.to_text(u""), unicode))
+
+            self.assertEqual(u"", cpt.to_text(str("")))
+            self.assertEqual(u"123", cpt.to_text(str("123")))
+            self.assertEqual(u"", cpt.to_text(b""))
+            self.assertEqual(u"123", cpt.to_text(b"123"))
+            self.assertEqual(u"", cpt.to_text(u""))
+            self.assertEqual(u"123", cpt.to_text(u"123"))
+
+            # check list types, not inplace
+            l = [""]
+            l2 = cpt.to_text(l)
+            self.assertTrue(isinstance(l2, list))
+            self.assertFalse(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([u""], l2)
+            l = ["", "123"]
+            l2 = cpt.to_text(l)
+            self.assertTrue(isinstance(l2, list))
+            self.assertFalse(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([u"", u"123"], l2)
+            l = ["", b'123', u"321"]
+            l2 = cpt.to_text(l)
+            self.assertTrue(isinstance(l2, list))
+            self.assertFalse(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([u"", u"123", u"321"], l2)
+            for i in l2:
+                self.assertTrue(isinstance(i, unicode))
+
+            # check list types, inplace
+            l = [""]
+            l2 = cpt.to_text(l, inplace=True)
+            self.assertTrue(isinstance(l2, list))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([u""], l2)
+            l = ["", "123"]
+            l2 = cpt.to_text(l, inplace=True)
+            self.assertTrue(isinstance(l2, list))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([u"", u"123"], l2)
+            l = ["", b"123", u"321"]
+            l2 = cpt.to_text(l, inplace=True)
+            self.assertTrue(isinstance(l2, list))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([u"", u"123", u"321"], l2)
+
+            # check set types, not inplace
+            l = set("")
+            l2 = cpt.to_text(l, inplace=False)
+            self.assertTrue(isinstance(l2, set))
+            self.assertFalse(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set(u""), l2)
+            l = set([b"", b"123"])
+            l2 = cpt.to_text(l, inplace=False)
+            self.assertTrue(isinstance(l2, set))
+            self.assertFalse(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set([u"", u"123"]), l2)
+            l = set(["", b"123", u"321"])
+            l2 = cpt.to_text(l, inplace=False)
+            self.assertTrue(isinstance(l2, set))
+            self.assertFalse(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set([u"", u"123", u"321"]), l2)
+            for i in l2:
+                self.assertTrue(isinstance(i, unicode))
+
+            # check set types, inplace
+            l = set("")
+            l2 = cpt.to_text(l, inplace=True)
+            self.assertTrue(isinstance(l2, set))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set(u""), l2)
+            l = set([b"", b"123"])
+            l2 = cpt.to_text(l, inplace=True)
+            self.assertTrue(isinstance(l2, set))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set([u"", u"123"]), l2)
+            l = set(["", b"123", u"321"])
+            l2 = cpt.to_text(l, inplace=True)
+            self.assertTrue(isinstance(l2, set))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set([u"", u"123", u"321"]), l2)
+
+        elif six.PY3:
+            self.assertIsNone(cpt.to_text(None))
+
+            self.assertTrue(isinstance(cpt.to_text(str("")), str))
+            self.assertTrue(isinstance(cpt.to_text(str("123")), str))
+            self.assertTrue(isinstance(cpt.to_text(b""), str))
+            self.assertTrue(isinstance(cpt.to_text(b""), str))
+            self.assertTrue(isinstance(cpt.to_text(u""), str))
+            self.assertTrue(isinstance(cpt.to_text(u""), str))
+
+            self.assertEqual("", cpt.to_text(str("")))
+            self.assertEqual("123", cpt.to_text(str("123")))
+            self.assertEqual("", cpt.to_text(b""))
+            self.assertEqual("123", cpt.to_text(b"123"))
+            self.assertEqual("", cpt.to_text(u""))
+            self.assertEqual("123", cpt.to_text(u"123"))
+
+            # check list types, not inplace
+            l = [""]
+            l2 = cpt.to_text(l)
+            self.assertTrue(isinstance(l2, list))
+            self.assertFalse(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([""], l2)
+            l = ["", "123"]
+            l2 = cpt.to_text(l)
+            self.assertTrue(isinstance(l2, list))
+            self.assertFalse(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(["", "123"], l2)
+            l = ["", b"123", u"321"]
+            l2 = cpt.to_text(l)
+            self.assertTrue(isinstance(l2, list))
+            self.assertFalse(l is l2)
+            self.assertNotEqual(l, l2)
+            self.assertEqual(["", "123", "321"], l2)
+
+            # check list types, inplace
+            l = [""]
+            l2 = cpt.to_text(l, inplace=True)
+            self.assertTrue(isinstance(l2, list))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([""], l2)
+            l = ["", b"123"]
+            l2 = cpt.to_text(l, inplace=True)
+            self.assertTrue(isinstance(l2, list))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(["", "123"], l2)
+            l = ["", b"123", u"321"]
+            l2 = cpt.to_text(l, inplace=True)
+            self.assertTrue(isinstance(l2, list))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(["", "123", "321"], l2)
+            for i in l2:
+                self.assertTrue(isinstance(i, str))
+
+            # check set types, not inplace
+            l = set("")
+            l2 = cpt.to_text(l, inplace=False)
+            self.assertTrue(isinstance(l2, set))
+            self.assertFalse(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set(""), l2)
+            l = set([b"", b"123"])
+            l2 = cpt.to_text(l, inplace=False)
+            self.assertTrue(isinstance(l2, set))
+            self.assertFalse(l is l2)
+            self.assertNotEqual(l, l2)
+            self.assertEqual(set(["", "123"]), l2)
+            l = set(["", b"123", u"321"])
+            l2 = cpt.to_text(l, inplace=False)
+            self.assertTrue(isinstance(l2, set))
+            self.assertFalse(l is l2)
+            self.assertNotEqual(l, l2)
+            self.assertEqual(set(["", "123", "321"]), l2)
+
+            # check set types, inplace
+            l = set("")
+            l2 = cpt.to_text(l, inplace=True)
+            self.assertTrue(isinstance(l2, set))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set(""), l2)
+            l = set([b"", b"123"])
+            l2 = cpt.to_text(l, inplace=True)
+            self.assertTrue(isinstance(l2, set))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set(["", "123"]), l2)
+            l = set(["", b"123", u"321"])
+            l2 = cpt.to_text(l, inplace=True)
+            self.assertTrue(isinstance(l2, set))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set(["", "123", "321"]), l2)
+            for i in l2:
+                self.assertTrue(isinstance(i, str))
+
+    def test_to_bytes(self):
+        # Only support python2.x and python3.x now
+        self.assertTrue(six.PY2 | six.PY3)
+
+        if six.PY2:
+            # check None
+            self.assertIsNone(cpt.to_bytes(None))
+
+            # check all string related types
+            self.assertTrue(isinstance(cpt.to_bytes(str("")), bytes))
+            self.assertTrue(isinstance(cpt.to_bytes(str("123")), bytes))
+            self.assertTrue(isinstance(cpt.to_bytes(b""), bytes))
+            self.assertTrue(isinstance(cpt.to_bytes(b""), bytes))
+            self.assertTrue(isinstance(cpt.to_bytes(u""), bytes))
+            self.assertTrue(isinstance(cpt.to_bytes(u""), bytes))
+
+            self.assertEqual(b"", cpt.to_bytes(str("")))
+            self.assertEqual(b"123", cpt.to_bytes(str("123")))
+            self.assertEqual(b"", cpt.to_bytes(b""))
+            self.assertEqual(b"123", cpt.to_bytes(b"123"))
+            self.assertEqual(b"", cpt.to_bytes(u""))
+            self.assertEqual(b"123", cpt.to_bytes(u"123"))
+
+            # check list types, not inplace
+            l = [""]
+            l2 = cpt.to_bytes(l)
+            self.assertTrue(isinstance(l2, list))
+            self.assertFalse(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([b""], l2)
+            l = ["", "123"]
+            l2 = cpt.to_bytes(l)
+            self.assertTrue(isinstance(l2, list))
+            self.assertFalse(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([b"", b"123"], l2)
+            l = ["", b'123', u"321"]
+            l2 = cpt.to_bytes(l)
+            self.assertTrue(isinstance(l2, list))
+            self.assertFalse(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([b"", b"123", b"321"], l2)
+            for i in l2:
+                self.assertTrue(isinstance(i, bytes))
+
+            # check list types, inplace
+            l = [""]
+            l2 = cpt.to_bytes(l, inplace=True)
+            self.assertTrue(isinstance(l2, list))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([b""], l2)
+            l = ["", "123"]
+            l2 = cpt.to_bytes(l, inplace=True)
+            self.assertTrue(isinstance(l2, list))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([b"", b"123"], l2)
+            l = ["", b"123", u"321"]
+            l2 = cpt.to_bytes(l, inplace=True)
+            self.assertTrue(isinstance(l2, list))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([b"", b"123", b"321"], l2)
+
+            # check set types, not inplace
+            l = set("")
+            l2 = cpt.to_bytes(l, inplace=False)
+            self.assertTrue(isinstance(l2, set))
+            self.assertFalse(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set(b""), l2)
+            l = set([b"", b"123"])
+            l2 = cpt.to_bytes(l, inplace=False)
+            self.assertTrue(isinstance(l2, set))
+            self.assertFalse(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set([b"", b"123"]), l2)
+            l = set(["", b"123", u"321"])
+            l2 = cpt.to_bytes(l, inplace=False)
+            self.assertTrue(isinstance(l2, set))
+            self.assertFalse(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set([b"", b"123", b"321"]), l2)
+            for i in l2:
+                self.assertTrue(isinstance(i, bytes))
+
+            # check set types, inplace
+            l = set("")
+            l2 = cpt.to_bytes(l, inplace=True)
+            self.assertTrue(isinstance(l2, set))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set(b""), l2)
+            l = set([b"", b"123"])
+            l2 = cpt.to_bytes(l, inplace=True)
+            self.assertTrue(isinstance(l2, set))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set([b"", b"123"]), l2)
+            l = set(["", b"123", u"321"])
+            l2 = cpt.to_bytes(l, inplace=True)
+            self.assertTrue(isinstance(l2, set))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set([b"", b"123", b"321"]), l2)
+
+        elif six.PY3:
+            self.assertIsNone(cpt.to_bytes(None))
+
+            self.assertTrue(isinstance(cpt.to_bytes(str("")), bytes))
+            self.assertTrue(isinstance(cpt.to_bytes(str("123")), bytes))
+            self.assertTrue(isinstance(cpt.to_bytes(b""), bytes))
+            self.assertTrue(isinstance(cpt.to_bytes(b""), bytes))
+            self.assertTrue(isinstance(cpt.to_bytes(u""), bytes))
+            self.assertTrue(isinstance(cpt.to_bytes(u""), bytes))
+
+            self.assertEqual(b"", cpt.to_bytes(str("")))
+            self.assertEqual(b"123", cpt.to_bytes(str("123")))
+            self.assertEqual(b"", cpt.to_bytes(b""))
+            self.assertEqual(b"123", cpt.to_bytes(b"123"))
+            self.assertEqual(b"", cpt.to_bytes(u""))
+            self.assertEqual(b"123", cpt.to_bytes(u"123"))
+
+            # check list types, not inplace
+            l = [""]
+            l2 = cpt.to_bytes(l)
+            self.assertTrue(isinstance(l2, list))
+            self.assertFalse(l is l2)
+            self.assertNotEqual(l, l2)
+            self.assertEqual([b""], l2)
+            l = ["", "123"]
+            l2 = cpt.to_bytes(l)
+            self.assertTrue(isinstance(l2, list))
+            self.assertFalse(l is l2)
+            self.assertNotEqual(l, l2)
+            self.assertEqual([b"", b"123"], l2)
+            l = ["", b"123", u"321"]
+            l2 = cpt.to_bytes(l)
+            self.assertTrue(isinstance(l2, list))
+            self.assertFalse(l is l2)
+            self.assertNotEqual(l, l2)
+            self.assertEqual([b"", b"123", b"321"], l2)
+
+            # check list types, inplace
+            l = [""]
+            l2 = cpt.to_bytes(l, inplace=True)
+            self.assertTrue(isinstance(l2, list))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([b""], l2)
+            l = ["", b"123"]
+            l2 = cpt.to_bytes(l, inplace=True)
+            self.assertTrue(isinstance(l2, list))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([b"", b"123"], l2)
+            l = ["", b"123", u"321"]
+            l2 = cpt.to_bytes(l, inplace=True)
+            self.assertTrue(isinstance(l2, list))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual([b"", b"123", b"321"], l2)
+            for i in l2:
+                self.assertTrue(isinstance(i, bytes))
+
+            # check set types, not inplace
+            l = set([""])
+            l2 = cpt.to_bytes(l, inplace=False)
+            self.assertTrue(isinstance(l2, set))
+            self.assertFalse(l is l2)
+            self.assertNotEqual(l, l2)
+            self.assertEqual(set([b""]), l2)
+            l = set([u"", u"123"])
+            l2 = cpt.to_bytes(l, inplace=False)
+            self.assertTrue(isinstance(l2, set))
+            self.assertFalse(l is l2)
+            self.assertNotEqual(l, l2)
+            self.assertEqual(set([b"", b"123"]), l2)
+            l = set(["", b"123", u"321"])
+            l2 = cpt.to_bytes(l, inplace=False)
+            self.assertTrue(isinstance(l2, set))
+            self.assertFalse(l is l2)
+            self.assertNotEqual(l, l2)
+            self.assertEqual(set([b"", b"123", b"321"]), l2)
+
+            # check set types, inplace
+            l = set("")
+            l2 = cpt.to_bytes(l, inplace=True)
+            self.assertTrue(isinstance(l2, set))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set(b""), l2)
+            l = set([u"", u"123"])
+            l2 = cpt.to_bytes(l, inplace=True)
+            self.assertTrue(isinstance(l2, set))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set([b"", b"123"]), l2)
+            l = set(["", b"123", u"321"])
+            l2 = cpt.to_bytes(l, inplace=True)
+            self.assertTrue(isinstance(l2, set))
+            self.assertTrue(l is l2)
+            self.assertEqual(l, l2)
+            self.assertEqual(set([b"", b"123", b"321"]), l2)
+            for i in l2:
+                self.assertTrue(isinstance(i, bytes))
+
+    def test_round(self):
+        self.assertEqual(3.0, cpt.round(3.4))
+        self.assertEqual(4.0, cpt.round(3.5))
+        self.assertEqual(0.0, cpt.round(0.1))
+        self.assertEqual(0.0, cpt.round(0.0))
+        self.assertEqual(-0.0, cpt.round(-0.0))
+        self.assertEqual(-0.0, cpt.round(-0.1))
+        self.assertEqual(-3.0, cpt.round(-3.4))
+        self.assertEqual(-4.0, cpt.round(-3.5))
+        self.assertEqual(5.0, cpt.round(5))
+        self.assertRaises(TypeError, cpt.round, None)
+
+    def test_floor_division(self):
+        self.assertEqual(0.0, cpt.floor_division(3, 4))
+        self.assertEqual(1.0, cpt.floor_division(4, 3))
+        self.assertEqual(2.0, cpt.floor_division(6, 3))
+        self.assertEqual(-2.0, cpt.floor_division(-4, 3))
+        self.assertEqual(-2.0, cpt.floor_division(-6, 3))
+        self.assertRaises(ZeroDivisionError, cpt.floor_division, 3, 0)
+        self.assertRaises(TypeError, cpt.floor_division, None, None)
+
+    def test_get_exception_message(self):
+        exception_message = "test_message"
+        self.assertRaises(AssertionError, cpt.get_exception_message, None)
+        if six.PY2:
+            self.assertRaises(AttributeError, cpt.get_exception_message,
+                              exception_message)
+            try:
+                raise RuntimeError(exception_message)
+            except Exception as e:
+                self.assertEqual(exception_message,
+                                 cpt.get_exception_message(e))
+                self.assertIsNotNone(e)
+
+            try:
+                raise Exception(exception_message)
+            except Exception as e:
+                self.assertEqual(exception_message,
+                                 cpt.get_exception_message(e))
+                self.assertIsNotNone(e)
+
+        if six.PY3:
+            try:
+                raise RuntimeError(exception_message)
+            except Exception as e:
+                self.assertEqual(exception_message,
+                                 cpt.get_exception_message(e))
+                self.assertIsNotNone(e)
+
+            try:
+                raise Exception(exception_message)
+            except Exception as e:
+                self.assertEqual(exception_message,
+                                 cpt.get_exception_message(e))
+                self.assertIsNotNone(e)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py
index e9f3c45dc40b3333fe7304f8e4313d156bd5374c..436ab7d49f4cafcd30366ae57c40d49e6f7d614f 100644
--- a/python/paddle/fluid/tests/unittests/test_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_conditional_block.py b/python/paddle/fluid/tests/unittests/test_conditional_block.py
index 77869a1242e08d348bfb1031b8f5b1ab5c81d868..5b2b71d050c42b4fea84bab89824d3f5c164b36e 100644
--- a/python/paddle/fluid/tests/unittests/test_conditional_block.py
+++ b/python/paddle/fluid/tests/unittests/test_conditional_block.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid.layers as layers
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_const_value.py b/python/paddle/fluid/tests/unittests/test_const_value.py
index 58ac6fa0a9a30a08a831111513777cca59062724..0b2431d7726e845da33f6bcf9c74058788dd9654 100644
--- a/python/paddle/fluid/tests/unittests/test_const_value.py
+++ b/python/paddle/fluid/tests/unittests/test_const_value.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid.framework as framework
 
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
index d0de7ad52c8a851c16cbbbf544d479f696dee136..1902a9869807ba7ce3f9828c124256cc6752857e 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
index bb1cd87d615fa341b7244e9f3e113b9fb4765ac2..6a2732e9399aa5a93f4c47eb73bfd23dba608c3d 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 
@@ -24,12 +26,12 @@ def conv2d_forward_naive(input, filter, group, conv_param):
     out_c, f_c, f_h, f_w = filter.shape
     assert f_c * group == in_c
     assert np.mod(out_c, group) == 0
-    sub_out_c = out_c / group
+    sub_out_c = out_c // group
 
     stride, pad, dilation = conv_param['stride'], conv_param['pad'], conv_param[
         'dilation']
-    out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) / stride[0]
-    out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) / stride[1]
+    out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0]
+    out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1]
     out = np.zeros((in_n, out_c, out_h, out_w))
 
     d_bolck_h = (dilation[0] * (f_h - 1) + 1)
@@ -138,7 +140,7 @@ class TestConv2dOp(OpTest):
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
         assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] / self.groups
+        f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
 
     def init_dilation(self):
@@ -157,7 +159,7 @@ class TestWithPad(TestConv2dOp):
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
         assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] / self.groups
+        f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
 
 
@@ -167,7 +169,7 @@ class TestWithStride(TestConv2dOp):
         self.stride = [2, 2]
         self.input_size = [2, 3, 6, 6]  # NCHW
         assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] / self.groups
+        f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
 
 
@@ -182,7 +184,7 @@ class TestWith1x1(TestConv2dOp):
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
         assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] / self.groups
+        f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 1, 1]
 
     def init_group(self):
@@ -195,7 +197,7 @@ class TestWithDilation(TestConv2dOp):
         self.stride = [1, 1]
         self.input_size = [2, 3, 10, 10]  # NCHW
         assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] / self.groups
+        f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
 
     def init_dilation(self):
@@ -211,7 +213,7 @@ class TestWithInput1x1Filter1x1(TestConv2dOp):
         self.stride = [1, 1]
         self.input_size = [2, 3, 1, 1]  # NCHW
         assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] / self.groups
+        f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 1, 1]
 
     def init_group(self):
@@ -328,7 +330,7 @@ class TestDepthwiseConv(TestConv2dOp):
         self.input_size = [2, 3, 5, 5]  # NCHW
         self.groups = 3
         assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] / self.groups
+        f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
         self.op_type = "depthwise_conv2d"
 
@@ -340,7 +342,7 @@ class TestDepthwiseConv2(TestConv2dOp):
         self.input_size = [2, 3, 5, 5]  # NCHW
         self.groups = 3
         assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] / self.groups
+        f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
         self.op_type = "depthwise_conv2d"
 
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
index af6cd99b0d7e6b0a2dfd4fc1d33e8390017a5906..2a320e735bd7db5dc138f8263ba1b5cb115ba197 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 
@@ -25,7 +27,7 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
     groups = attrs['groups']
     assert in_c == f_c
     out_c = f_out_c * groups
-    sub_in_c = in_c / groups
+    sub_in_c = in_c // groups
 
     stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
         'dilations']
@@ -258,7 +260,7 @@ class TestDepthwiseConvTranspose(TestConv2dTransposeOp):
         self.input_size = [2, 8, 16, 16]  # NCHW
         self.groups = 8
         assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] / self.groups
+        f_c = self.input_size[1] // self.groups
         self.filter_size = [self.input_size[1], f_c, 4, 4]
         self.op_type = "depthwise_conv2d_transpose"
 
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
index dd4ef7cc94ea1e8de5fe4775408389907d47d0d6..ddaf99fe061205f0f2e4c592c9e28e27e657c16a 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 
@@ -24,14 +26,14 @@ def conv3d_forward_naive(input, filter, group, conv_param):
     out_c, f_c, f_d, f_h, f_w = filter.shape
     assert f_c * group == in_c
     assert np.mod(out_c, group) == 0
-    sub_out_c = out_c / group
+    sub_out_c = out_c // group
 
     stride, pad, dilation = conv_param['stride'], conv_param['pad'], conv_param[
         'dilations']
 
-    out_d = 1 + (in_d + 2 * pad[0] - (dilation[0] * (f_d - 1) + 1)) / stride[0]
-    out_h = 1 + (in_h + 2 * pad[1] - (dilation[1] * (f_h - 1) + 1)) / stride[1]
-    out_w = 1 + (in_w + 2 * pad[2] - (dilation[2] * (f_w - 1) + 1)) / stride[2]
+    out_d = 1 + (in_d + 2 * pad[0] - (dilation[0] * (f_d - 1) + 1)) // stride[0]
+    out_h = 1 + (in_h + 2 * pad[1] - (dilation[1] * (f_h - 1) + 1)) // stride[1]
+    out_w = 1 + (in_w + 2 * pad[2] - (dilation[2] * (f_w - 1) + 1)) // stride[2]
 
     out = np.zeros((in_n, out_c, out_d, out_h, out_w))
 
@@ -166,7 +168,7 @@ class TestConv3dOp(OpTest):
         self.stride = [1, 1, 1]
         self.input_size = [2, 3, 4, 4, 4]  # NCDHW
         assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] / self.groups
+        f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3, 3]
 
     def init_dilation(self):
@@ -185,7 +187,7 @@ class TestCase1(TestConv3dOp):
         self.stride = [1, 1, 1]
         self.input_size = [2, 3, 4, 4, 4]  # NCDHW
         assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] / self.groups
+        f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3, 3]
 
 
@@ -205,7 +207,7 @@ class TestWith1x1(TestConv3dOp):
         self.stride = [1, 1, 1]
         self.input_size = [2, 3, 4, 4, 4]  # NCHW
         assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] / self.groups
+        f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 1, 1, 1]
 
     def init_dilation(self):
@@ -221,7 +223,7 @@ class TestWithInput1x1Filter1x1(TestConv3dOp):
         self.stride = [1, 1, 1]
         self.input_size = [2, 3, 1, 1, 1]  # NCHW
         assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] / self.groups
+        f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 1, 1, 1]
 
     def init_dilation(self):
@@ -237,7 +239,7 @@ class TestWithDilation(TestConv3dOp):
         self.stride = [1, 1, 1]
         self.input_size = [2, 3, 6, 6, 6]  # NCDHW
         assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] / self.groups
+        f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 2, 2, 2]
 
     def init_dilation(self):
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
index 300fa5e8bde001e0f66c5f924a81c30add99aead..8d9075961cbec32bc34fcf0c92cfbb7e6c00d886 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 
@@ -25,7 +27,7 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs):
     groups = attrs['groups']
     assert in_c == f_c
     out_c = f_out_c * groups
-    sub_in_c = in_c / groups
+    sub_in_c = in_c // groups
 
     stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
         'dilations']
diff --git a/python/paddle/fluid/tests/unittests/test_conv_shift_op.py b/python/paddle/fluid/tests/unittests/test_conv_shift_op.py
index 9fdb7baa90d2184c3c439e76b6bb5f0668f5f9ee..b7364e869e7420e610363eafcc4964b825e57326 100644
--- a/python/paddle/fluid/tests/unittests/test_conv_shift_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv_shift_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
@@ -21,7 +23,7 @@ def conv_shift_forward(x, y):
     out = np.zeros_like(x)
     M = x.shape[1]
     N = y.shape[1]
-    y_half_width = (N - 1) / 2
+    y_half_width = (N - 1) // 2
     for i in range(M):
         for j in range(N):
             out[:, i] += x[:, (i + j + M - y_half_width) % M] * y[:, j]
diff --git a/python/paddle/fluid/tests/unittests/test_cos_sim_op.py b/python/paddle/fluid/tests/unittests/test_cos_sim_op.py
index 1b27cd57670e6c9db5eae6b226989a5c772866ce..3c3fd6d4d71503ccc3678ca69d55bcc8536c8c6a 100644
--- a/python/paddle/fluid/tests/unittests/test_cos_sim_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cos_sim_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py b/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py
index 07c89eefc32fab37ce093e91d96fbe4471ecddc6..fd34c8fc9390b69afd93229b56aa9189da2a8b28 100644
--- a/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py
+++ b/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid.layers as layers
 
diff --git a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
index 122b076c2d3e3a69f52a2c335e2bc89707b4fa9b..51bd1300e61d58c934a40abf81ab8f137e44910f 100644
--- a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import random
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_crop_op.py b/python/paddle/fluid/tests/unittests/test_crop_op.py
index 4016089c01644f0389855ab114360f90c50a1bbe..d7bcfba8deab1b73e4cbab8a27f9eeef9a37d29b 100644
--- a/python/paddle/fluid/tests/unittests/test_crop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crop_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
index 86ac159323a5f9f6149ce5ed4437402eb885c6bc..fa367f95fc9c65dd782d53a2799cacadf74dcfd2 100644
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest, randomize_probability
diff --git a/python/paddle/fluid/tests/unittests/test_ctc_align.py b/python/paddle/fluid/tests/unittests/test_ctc_align.py
index 131b4076f45ae25b45bb3f64da07a5c3aacc43d5..5f17d2d407cca9a4c95d919d05a3a03b784d1942 100644
--- a/python/paddle/fluid/tests/unittests/test_ctc_align.py
+++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import sys
 import unittest
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py
index 04e7f0b94510987a1872c2d625ac4d29a3c6feba..13a4eacece8a211513d6537db0d09b80c238178e 100644
--- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py
index 951282e8bab5018204c0d31caa10f8f84a8f3d6c..e39eedd282daf6bbe0603a22c357e06c95c086b6 100644
--- a/python/paddle/fluid/tests/unittests/test_data_balance.py
+++ b/python/paddle/fluid/tests/unittests/test_data_balance.py
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid as fluid
-import paddle.v2 as paddle
+import paddle
 import numpy as np
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_debugger.py b/python/paddle/fluid/tests/unittests/test_debugger.py
index 870952f2f916dcdec5991ac5c10d2da3a7ab18a8..f4c9466d63a201ba9a5e77515ae64a33bedc5b23 100644
--- a/python/paddle/fluid/tests/unittests/test_debugger.py
+++ b/python/paddle/fluid/tests/unittests/test_debugger.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_decayed_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_decayed_adagrad_op.py
index 84c44d4817366518a2cbc3f0a777ab32b67f3d11..a664a1529f4de1f372241319b57fad6b0ba8b8a2 100644
--- a/python/paddle/fluid/tests/unittests/test_decayed_adagrad_op.py
+++ b/python/paddle/fluid/tests/unittests/test_decayed_adagrad_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py b/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py
index 868bcca881a65dad7d0ecabb1e388818cdd0997e..01a7b6824885b32e922a8eb34f5d8117ee3e584f 100644
--- a/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py
+++ b/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from paddle.fluid.default_scope_funcs import *
 import unittest
 
diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py
index 8603d3a5b3b5d368fe87b8dcf9dc7363f95caf86..fa6b67956259f33b109758c5939ab5729482695a 100644
--- a/python/paddle/fluid/tests/unittests/test_desc_clone.py
+++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 import argparse
 import time
@@ -25,6 +27,7 @@ import unittest
 from multiprocessing import Process
 import os
 import signal
+import six
 import collections
 
 SEED = 1
@@ -53,7 +56,8 @@ def cnn_model(data):
     # TODO(dzhwinter) : refine the initializer and random seed settting
     SIZE = 10
     input_shape = conv_pool_2.shape
-    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+    param_shape = [six.moves.reduce(lambda a, b: a * b, input_shape[1:], 1)
+                   ] + [SIZE]
     scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
 
     predict = fluid.layers.fc(
@@ -106,7 +110,7 @@ def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
 
 
 def operator_equal(a, b):
-    for k, v in a.__dict__.iteritems():
+    for k, v in six.iteritems(a.__dict__):
         if isinstance(v, fluid.framework.Program) or \
                 isinstance(v, fluid.framework.Block):
             continue
@@ -116,8 +120,8 @@ def operator_equal(a, b):
                 raise ValueError("In operator_equal not equal:{0}\n".format(k))
 
         elif isinstance(v, collections.OrderedDict):
-            v0 = sorted(v.iteritems(), key=lambda x: x[0])
-            v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0])
+            v0 = sorted(six.iteritems(v), key=lambda x: x[0])
+            v1 = sorted(six.iteritems(b.__dict__[k]), key=lambda x: x[0])
 
             if v0 != v1:
                 raise ValueError("In operator_equal not equal:{0}\n".format(k))
@@ -129,7 +133,7 @@ def operator_equal(a, b):
 
 
 def block_equal(a, b):
-    for k, v in a.__dict__.iteritems():
+    for k, v in six.iteritems(a.__dict__):
         if isinstance(v, core.ProgramDesc) or isinstance(
                 v, fluid.framework.Program) or isinstance(v, core.BlockDesc):
             continue
@@ -141,8 +145,8 @@ def block_equal(a, b):
             assert (len(a.ops) == len(b.ops))
 
         elif isinstance(v, collections.OrderedDict):
-            v0 = sorted(v.iteritems(), key=lambda x: x[0])
-            v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0])
+            v0 = sorted(six.iteritems(v), key=lambda x: x[0])
+            v1 = sorted(six.iteritems(b.__dict__[k]), key=lambda x: x[0])
 
             if v0 != v1:
                 raise ValueError("In block_equal not equal:{0}\n".format(k))
@@ -154,7 +158,7 @@ def block_equal(a, b):
 
 
 def program_equal(a, b):
-    for k, v in a.__dict__.iteritems():
+    for k, v in six.iteritems(a.__dict__):
         if isinstance(v, core.ProgramDesc):
             continue
 
diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
index 8b66d1b270980a18fd1bbd068917e982a450ad6f..f6eb8f2c6d8b94f92e24ff789c91efb53a645a46 100644
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
@@ -12,8 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
+import six
 import sys
 import collections
 import math
@@ -176,7 +179,7 @@ class TestDetectionMAPOp(OpTest):
             true_pos[label].append([score, tp])
             false_pos[label].append([score, fp])
 
-        for (label, label_pos_num) in list(label_count.items()):
+        for (label, label_pos_num) in six.iteritems(label_count):
             if label_pos_num == 0 or label not in true_pos: continue
             label_true_pos = true_pos[label]
             label_false_pos = false_pos[label]
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 4379463aca4443eb7a886ce78446440cc59f3b30..0e815c91446b285ba2c2c5aa9ad18d97f51eae65 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -11,11 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import time
 
 import unittest
 import os
 import sys
+import six
 import signal
 import subprocess
 import six
@@ -27,7 +30,7 @@ class TestDistRunnerBase(object):
             "get_model should be implemented by child classes.")
 
     def get_transpiler(self, trainer_id, main_program, pserver_endpoints,
-                       trainers):
+                       trainers, sync_mode):
         # NOTE: import fluid until runtime, or else forking processes will cause error.
         import paddle
         import paddle.fluid as fluid
@@ -36,17 +39,22 @@ class TestDistRunnerBase(object):
             trainer_id=trainer_id,
             program=main_program,
             pservers=pserver_endpoints,
-            trainers=trainers)
+            trainers=trainers,
+            sync_mode=sync_mode)
         return t
 
-    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
-                    trainer_id):
+    def run_pserver(self,
+                    pserver_endpoints,
+                    trainers,
+                    current_endpoint,
+                    trainer_id,
+                    sync_mode=True):
         import paddle
         import paddle.fluid as fluid
         self.get_model(batch_size=2)
         t = self.get_transpiler(trainer_id,
                                 fluid.default_main_program(), pserver_endpoints,
-                                trainers)
+                                trainers, sync_mode)
         pserver_prog = t.get_pserver_program(current_endpoint)
         startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
         place = fluid.CPUPlace()
@@ -54,7 +62,13 @@ class TestDistRunnerBase(object):
         exe.run(startup_prog)
         exe.run(pserver_prog)
 
-    def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
+    def run_trainer(self,
+                    place,
+                    endpoints,
+                    trainer_id,
+                    trainers,
+                    is_dist=True,
+                    sync_mode=True):
         import paddle
         import paddle.fluid as fluid
         test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
@@ -62,7 +76,7 @@ class TestDistRunnerBase(object):
         if is_dist:
             t = self.get_transpiler(trainer_id,
                                     fluid.default_main_program(), endpoints,
-                                    trainers)
+                                    trainers, sync_mode)
             trainer_prog = t.get_trainer_program()
         else:
             trainer_prog = fluid.default_main_program()
@@ -103,9 +117,9 @@ def runtime_main(test_class):
     import paddle.fluid as fluid
     import paddle.fluid.core as core
 
-    if len(sys.argv) != 7:
+    if len(sys.argv) != 8:
         print(
-            "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
+            "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]"
         )
     role = sys.argv[1]
     endpoints = sys.argv[2]
@@ -113,31 +127,43 @@ def runtime_main(test_class):
     current_endpoint = sys.argv[4]
     trainers = int(sys.argv[5])
     is_dist = True if sys.argv[6] == "TRUE" else False
+    sync_mode = True if sys.argv[7] == "TRUE" else False
 
     model = test_class()
     if role == "pserver":
-        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
+        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id,
+                          sync_mode)
     else:
         p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
-        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)
+        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist,
+                          sync_mode)
+
+
+import paddle.compat as cpt
 
 
 class TestDistBase(unittest.TestCase):
+    def _setup_config(self):
+        raise NotImplementedError("tests should have _setup_config implemented")
+
     def setUp(self):
         self._trainers = 2
         self._pservers = 2
         self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
         self._python_interp = "python"
+        self._sync_mode = True
+        self._setup_config()
 
     def start_pserver(self, model_file, check_error_log):
+        sync_mode_str = "TRUE" if self._sync_mode else "FALSE"
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+        ps0_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \
             (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
-             self._trainers)
-        ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+             self._trainers, sync_mode_str)
+        ps1_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \
             (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
-             self._trainers)
+             self._trainers, sync_mode_str)
 
         ps0_pipe = subprocess.PIPE
         ps1_pipe = subprocess.PIPE
@@ -189,9 +215,10 @@ class TestDistBase(unittest.TestCase):
         # Run local to get a base line
         env_local = {"CUDA_VISIBLE_DEVICES": "0"}
         env_local.update(required_envs)
-        local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \
+        sync_mode_str = "TRUE" if self._sync_mode else "FALSE"
+        local_cmd = "%s %s trainer %s 0 %s %d FLASE %s" % \
             (self._python_interp, model_file,
-             "127.0.0.1:1234", "127.0.0.1:1234", 1)
+             "127.0.0.1:1234", "127.0.0.1:1234", 1, sync_mode_str)
         if not check_error_log:
             local_proc = subprocess.Popen(
                 local_cmd.split(" "),
@@ -209,7 +236,7 @@ class TestDistBase(unittest.TestCase):
 
         local_proc.wait()
         out, err = local_proc.communicate()
-        local_ret = out
+        local_ret = cpt.to_text(out)
         sys.stderr.write('local_loss: %s\n' % local_ret)
         sys.stderr.write('local_stderr: %s\n' % err)
 
@@ -220,12 +247,12 @@ class TestDistBase(unittest.TestCase):
         self._wait_ps_ready(ps1.pid)
 
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \
+        tr0_cmd = "%s %s trainer %s 0 %s %d TRUE %s" % \
             (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
-             self._trainers)
-        tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \
+             self._trainers, sync_mode_str)
+        tr1_cmd = "%s %s trainer %s 1 %s %d TRUE %s" % \
             (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
-             self._trainers)
+             self._trainers, sync_mode_str)
 
         env0 = {"CUDA_VISIBLE_DEVICES": "0"}
         env1 = {"CUDA_VISIBLE_DEVICES": "1"}
@@ -256,7 +283,7 @@ class TestDistBase(unittest.TestCase):
         tr1_proc.wait()
         out, err = tr0_proc.communicate()
         sys.stderr.write('dist_stderr: %s\n' % err)
-        loss_data0 = out
+        loss_data0 = cpt.to_text(out)
         sys.stderr.write('dist_loss: %s\n' % loss_data0)
         lines = loss_data0.split("\n")
         dist_first_loss = eval(lines[0].replace(" ", ","))[0]
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
index b3ccec9a7d65de57778a1f013465d41a5a267676..36bab6f04603b7ad3218603489eead859bfcb5b6 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -11,14 +11,27 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import unittest
 from test_dist_base import TestDistBase
 
 
-class TestDistSeResneXt2x2(TestDistBase):
+class TestDistMnist2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+
     def test_se_resnext(self):
         self.check_with_place("dist_mnist.py", delta=1e-7)
 
 
+class TestDistMnistAsync(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+
+    def test_se_resnext(self):
+        self.check_with_place("dist_mnist.py", delta=200)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
index a33a338fc11e4301a8ec0eb565686d62b547b7f7..c0e9fa38e7d1eadd89eff9a8ba4442f888b8120e 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
@@ -11,14 +11,27 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import unittest
 from test_dist_base import TestDistBase
 
 
 class TestDistSeResneXt2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+
     def test_se_resnext(self):
         self.check_with_place("dist_se_resnext.py", delta=1e-7)
 
 
+class TestDistSeResneXt2x2Async(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+
+    def test_se_resnext(self):
+        self.check_with_place("dist_se_resnext.py", delta=100)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py
index 55aa923f5ab229bc8e9a0b891e0ac9c2ec49d31b..9581abdf394d738470d32ae609838832077ee519 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import os
 import time
 import unittest
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
index 68cd35d751dbce7eef9919dc8678fc0dd117757b..62fcf5953f93637a20beed649de21476a8673419 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
@@ -12,11 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 from test_dist_base import TestDistBase
 
 
 class TestDistTransformer2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+
     def test_transformer(self):
         # TODO(paddle-dev): check if the delta is OK.
         # Usually start around ~8000 and converge to ~5000
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 124abf4ccde98d565b3286c72793c91fd26bb71c..9f04d290f7596a60d5fdfa66cbc4beec1c3fe93d 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import math
 
 import unittest
@@ -45,10 +47,10 @@ class TranspilerTest(unittest.TestCase):
         avg_cost = fluid.layers.mean(cost)
         sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
         sgd_optimizer.minimize(avg_cost)
-        return
 
     def get_main_program(self):
         main = fluid.Program()
+        main.random_seed = 1
         with fluid.program_guard(main):
             self.net_conf()
         self.origin_prog = main.clone()
@@ -92,8 +94,9 @@ class TranspilerTest(unittest.TestCase):
     def test_transpiler(self):
         main = fluid.Program()
         startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            self.transpiler_test_impl()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, startup):
+                self.transpiler_test_impl()
 
 
 class TestBasicModel(TranspilerTest):
@@ -246,7 +249,6 @@ class TestLRDecay(TranspilerTest):
                 decay_rate=0.1,
                 staircase=True))
         sgd_optimizer.minimize(avg_cost)
-        return
 
     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
@@ -276,7 +278,6 @@ class TestLRDecayConditional(TranspilerTest):
             learning_rate=fluid.layers.piecewise_decay([10000, 20000],
                                                        [1.0, 0.5, 1.0]))
         sgd_optimizer.minimize(avg_cost)
-        return
 
     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
@@ -325,7 +326,6 @@ class TestL2Decay(TranspilerTest):
         avg_cost = fluid.layers.mean(cost)
         sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
         sgd_optimizer.minimize(avg_cost)
-        return
 
     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
@@ -360,7 +360,6 @@ class TestL2DecayWithPiecewise(TranspilerTest):
             momentum=0.9,
             regularization=fluid.regularizer.L2Decay(1e-4))
         sgd_optimizer.minimize(avg_cost)
-        return
 
     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
@@ -390,13 +389,14 @@ class TestDistLookupTableBase(TranspilerTest):
     def network_with_table(self, is_sparse, is_distributed):
         self.table_size = 1000
         self.emb_size = 64
+        self.lookup_table_name = 'shared_w'
 
         def emb_pool(ids):
             emb = fluid.layers.embedding(
                 input=ids,
                 size=[self.table_size, self.emb_size],
                 dtype='float32',
-                param_attr='shared_w',  # share parameter
+                param_attr=self.lookup_table_name,  # share parameter
                 is_sparse=is_sparse,
                 is_distributed=is_distributed)
             pool = fluid.layers.sequence_pool(input=emb, pool_type='average')
@@ -569,7 +569,7 @@ class TestDistLookupTableSliceSize(TestDistLookupTableBase):
 
     def transpiler_test_impl(self):
         config = fluid.DistributeTranspilerConfig()
-        pserver1, startup1 = self.get_pserver(self.pserver1_ep, config)
+        pserver1, _ = self.get_pserver(self.pserver1_ep, config)
 
         self.assertTrue(self.transpiler.has_distributed_lookup_table)
         lookup_table_var = pserver1.global_block().vars[
@@ -579,6 +579,21 @@ class TestDistLookupTableSliceSize(TestDistLookupTableBase):
         self.assertEqual(row_size, calc_row_size)
 
 
+class TestDistArgsInProgram(TestDistLookupTableBase):
+    def net_conf(self):
+        self.network_with_table(is_sparse=True, is_distributed=True)
+
+    def transpiler_test_impl(self):
+        trainer, _ = self.get_trainer()
+
+        self.assertTrue(trainer._is_distributed)
+        self.assertTrue(trainer._is_chief)
+        self.assertEqual(trainer._distributed_lookup_table,
+                         self.lookup_table_name)
+        self.assertEqual(trainer._endpoints,
+                         [self.pserver1_ep, self.pserver2_ep])
+
+
 class TestRMSPropOptimizer(TranspilerTest):
     def net_conf(self):
         x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
@@ -592,7 +607,6 @@ class TestRMSPropOptimizer(TranspilerTest):
         avg_cost = fluid.layers.mean(cost)
         optimizer = fluid.optimizer.RMSProp(learning_rate=0.1)
         optimizer.minimize(avg_cost)
-        return
 
     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
@@ -609,5 +623,40 @@ class TestRMSPropOptimizer(TranspilerTest):
         self.assertEqual(moment_var.shape, (500, 1000))
 
 
+class TestLoadSliceVar(TranspilerTest):
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'),
+                                    bias_attr=fluid.ParamAttr(name='fc_b'))
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        optimizer = fluid.optimizer.RMSProp(learning_rate=0.1)
+        optimizer.minimize(avg_cost)
+
+    def transpiler_test_impl(self):
+        pserver, _ = self.get_pserver(self.pserver1_ep)
+        pserver2, _ = self.get_pserver(self.pserver2_ep)
+
+        self.assertTrue(pserver._slice_vars_and_attrs)
+        self.assertTrue(pserver2._slice_vars_and_attrs)
+
+        for idx in xrange(len(pserver._slice_vars_and_attrs)):
+            self.assertEqual(pserver._slice_vars_and_attrs[idx][0],
+                             pserver2._slice_vars_and_attrs[idx][0])
+
+            total_numel = reduce(lambda x, y: x * y,
+                                 pserver._slice_vars_and_attrs[idx][0].shape)
+            self.assertEqual(
+                total_numel,
+                reduce(lambda x, y: x * y,
+                       pserver._slice_vars_and_attrs[idx][2].shape) + reduce(
+                           lambda x, y: x * y,
+                           pserver2._slice_vars_and_attrs[idx][2].shape))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
index 543d0f9dc2c9b8cdcfaaaa14a7a4f197d210d951..38af149ad336fcb818c3cbc9c686bcbdf00238be 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
@@ -11,14 +11,27 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import unittest
 from test_dist_base import TestDistBase
 
 
 class TestDistSeResneXt2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+
     def test_se_resnext(self):
         self.check_with_place("dist_word2vec.py", delta=1e-7)
 
 
+class TestDistSeResneXt2x2Async(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+
+    def test_se_resnext(self):
+        self.check_with_place("dist_word2vec.py", delta=1)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py
index eaa3435a86462236a99489749abe877648677053..0296bc2af4e0b79478c34b4cceab32b5a8a50f2f 100644
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
index fdc6adc93bc2488d4faffed61fde5d54bbbbfd57..d84dab1499a267ca081c2e8ea2856c7c4bb627cb 100644
--- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid as fluid
 import paddle
 import unittest
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
index 7756885166c88eadb77c2c6d56aab767015abc51..9d635f36fe83d041bb57df0759da1481f66bbaa2 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy
 import random
 import collections
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
index d182889a970fb178dec4976aebbd79d05dc3e91e..b4359fc69ae18b45774af0d2b20c1540bd99da5c 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
index 816562621b4fc749f3c6b0eca8ee3c5850ef1ba9..4d03523025d357e453848f3016ffee890b5d46ec 100644
--- a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
+++ b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py
index bcdbfc8e527d0dc9a95eddaf040f8035207b6c20..d85cc1f856df8eaa73cef318b48a292042488edf 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import unittest
 import numpy as np
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
index fb9a496126f0b6efcad73590c78efe5a47b88cd6..5aec5d8e38aba39e6aba9a8f19637587c2f12544 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import unittest
 import numpy as np
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
index bfe022af6dac711e76678b79005d4cfff90c2a2b..cadaf1df53af0af56afa8c3631b0f5ce390f318c 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
index 6f350044892a4ba2a985b5bc2328ab1fc20c5504..9f452ffde74ee18d14f155fb5ed53fee57f12f49 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import unittest
 import numpy as np
 
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py
index b6cd18a579520f921feed48cc86d8027f6a7bd1e..43c58710ba50c27077942643b84b7642eaf57710 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py
index 92099724fe65050b62c738f4e6c269a0ca3f4ef1..45c861e2c3df9f14f9886091012d6cca69944454 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
index 2742bb21d95ed3363e82a70b0172cc787878abd9..775c2253ab3b27708b745b85fc007fcb504d1aed 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py
index a3fd18669c556701b2586e88ddbb89ca79549a86..7bf642f03f480b1eeec68298f9d453deb1fa2ac3 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
index 1854232194963bcbe302010320a30d85747eea96..6cb88a8bb1cad7a58ca175cfc14298c959e3bad6 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_exception.py b/python/paddle/fluid/tests/unittests/test_exception.py
index bb7c0f88f6027807394e15aa6803da2ddc22f4e2..798ed53cddade22e986cae65109b6c6ac7a291b6 100644
--- a/python/paddle/fluid/tests/unittests/test_exception.py
+++ b/python/paddle/fluid/tests/unittests/test_exception.py
@@ -12,19 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
+import paddle.compat as cpt
 import paddle.fluid.core as core
 import unittest
 
 
 class TestException(unittest.TestCase):
     def test_exception(self):
-        ex = None
+        exception = None
         try:
             core.__unittest_throw_exception__()
         except core.EnforceNotMet as ex:
-            self.assertIn("test exception", ex.message)
+            self.assertIn("test exception", cpt.get_exception_message(ex))
+            exception = ex
 
-        self.assertIsNotNone(ex)
+        self.assertIsNotNone(exception)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_executor_and_mul.py b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
index e1272c1d6dd7131b55ecf33fa0de0fc78a3ac5a7..b1f89eca6e58aec41b5863f4c885d5c6231a72f4 100644
--- a/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 import numpy
diff --git a/python/paddle/fluid/tests/unittests/test_expand_op.py b/python/paddle/fluid/tests/unittests/test_expand_op.py
index a91e3aef5a18a681f3baf405da2beebb8c85360c..67a8d8f0721c2c75b432d68d64be8fc1035ffc74 100644
--- a/python/paddle/fluid/tests/unittests/test_expand_op.py
+++ b/python/paddle/fluid/tests/unittests/test_expand_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_extract_rows_op.py b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py
index 6a41c44fe655b18626bdb727745dae032babe8ad..8629bcf0f2e3c37aefdbf79b203176a43e0c3a7e 100644
--- a/python/paddle/fluid/tests/unittests/test_extract_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
index 026ac2112b2d78644b3315b9cab8019ca27e9714..d84ebed3fac67db323392494c701cf2a51b28305 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import math
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
index 6c6aa9d3bb656740c528c728efafc6a47e8bff91..cc0494774a5f2f24faaae65f193fc3ff9270d9ac 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
index 099e6e60642e9637f8f3648696e844c667e1c406..45951a34d6f61a242cb2dc004d6801a6c1c9dd92 100644
--- a/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py
index 2bb920710a9b10f3a8159bad3b33dd15ffbada19..ff417ad2f16b83cd42a0603375c14450195e7fc0 100644
--- a/python/paddle/fluid/tests/unittests/test_fc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fc_op.py
@@ -64,27 +64,47 @@ class TestFCOp(OpTest):
         self.check_output()
 
 
-class TestFCOpBiasBoth(TestFCOp):
+class TestFCOpNoBias(TestFCOp):
     def init_shapes(self, mb, ic, oc, h, w):
-        for with_bias in {True, False}:
-            self.with_bias = with_bias
-            self.matrix = MatrixGenerate(mb, ic, oc, h, w)
+        self.with_bias = False
+        self.matrix = MatrixGenerate(mb, ic, oc, h, w)
 
 
-class TestFCOp1(TestFCOpBiasBoth):
+class TestFCOpWithBias(TestFCOp):
+    def init_shapes(self, mb, ic, oc, h, w):
+        self.with_bias = True
+        self.matrix = MatrixGenerate(mb, ic, oc, h, w)
+
+
+class TestFCOp1(TestFCOpNoBias):
     def init_op_type(self):
         self.init_shapes(2, 8, 10, 1, 1)
 
 
-class TestFCOp2(TestFCOpBiasBoth):
+class TestFCOp2(TestFCOpNoBias):
     def init_op_type(self):
         self.init_shapes(4, 5, 6, 2, 2)
 
 
-class TestFCOp4(TestFCOpBiasBoth):
+class TestFCOp4(TestFCOpNoBias):
     def init_op_type(self):
         self.init_shapes(1, 32, 64, 3, 3)
 
 
+class TestFCOpWithBias1(TestFCOpWithBias):
+    def init_op_type(self):
+        self.init_shapes(3, 8, 10, 2, 1)
+
+
+class TestFCOpWithBias2(TestFCOpWithBias):
+    def init_op_type(self):
+        self.init_shapes(4, 5, 6, 2, 2)
+
+
+class TestFCOpWithBias3(TestFCOpWithBias):
+    def init_op_type(self):
+        self.init_shapes(1, 64, 32, 3, 3)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
index 8b9da843115409c65055927d317867d1290c8f0e..b823d397e9530362f5fee417278e36477d65f6f5 100644
--- a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
+++ b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid.core as core
 import unittest
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_fetch_var.py b/python/paddle/fluid/tests/unittests/test_fetch_var.py
index e6f37f0b4ca781e4ec83a00f8f2605ef02716bd7..de339d821b1329662469c26eacd234b74a102e13 100644
--- a/python/paddle/fluid/tests/unittests/test_fetch_var.py
+++ b/python/paddle/fluid/tests/unittests/test_fetch_var.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import op_test
diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
index 0c75cf33f5f208d11081a6802910c25553b8c4ec..fdc8a118e56f4473da5ed60169daebec14c7c33c 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
index 5e2ddb218af8fcf4f686260296b57519ec7486b9..44fb1d047dff48d2554c0bf637afbfda725e0a02 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_fill_op.py b/python/paddle/fluid/tests/unittests/test_fill_op.py
index 762d29199e2127415ed7daabca63edcdbae3344f..b734ee05b3f2291d7a79f1550946bf6546ada6e0 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py
index c9b3e4ba138f425fd2991bf637d2e32be3eb5168..eec73d0beb39c49f535a03532e536092001c8445 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_flatten_op.py b/python/paddle/fluid/tests/unittests/test_flatten_op.py
index f8692ce2ea66ef61c63bc41e77df050398ac63fd..17b01e03124e8007c51107b414c628d4bfc49c79 100644
--- a/python/paddle/fluid/tests/unittests/test_flatten_op.py
+++ b/python/paddle/fluid/tests/unittests/test_flatten_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 
diff --git a/python/paddle/fluid/tests/unittests/test_framework_debug_str.py b/python/paddle/fluid/tests/unittests/test_framework_debug_str.py
index c906c74afe66b05e2ca0e1122677e2dc738351b8..72f43e56ccbe04f56cfd5a655fb57c58369039bb 100644
--- a/python/paddle/fluid/tests/unittests/test_framework_debug_str.py
+++ b/python/paddle/fluid/tests/unittests/test_framework_debug_str.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 from paddle.fluid.framework import Program
 
diff --git a/python/paddle/fluid/tests/unittests/test_ftrl_op.py b/python/paddle/fluid/tests/unittests/test_ftrl_op.py
index 5f7581391afb2bd57cb25695f3c0d4db8573c80c..a6390b054f06184831c289fe9556216ae213be7c 100644
--- a/python/paddle/fluid/tests/unittests/test_ftrl_op.py
+++ b/python/paddle/fluid/tests/unittests/test_ftrl_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
index ec0a939e9ec21952a6657ea849bb9844bb69cc8d..97e1b9061afb738dd9e5f8b3b6a9c9a123c6aac6 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import unittest
 import numpy as np
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d8bef677fd16fb6bdc20b929137b4d885f4efd1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
@@ -0,0 +1,151 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+from test_lstm_op import lstm, ACTIVATION
+
+
+def fc(x, w, b):
+    return np.dot(x, w) + b
+
+
+def fusion_lstm(
+        x,  # T x M
+        lod,  # 1 x N
+        wx=None,  # M x 4D
+        bx=None,  # 1 x 4D
+        h0=None,  # N x D
+        c0=None,  # N x D
+        w_h=None,  # D x 4D
+        w_b=None,  # 1 x 4D
+        w_c=None,  # 1 x 3D
+        is_reverse=False,
+        act_gate=None,
+        act_cell=None,
+        act_cand=None):
+    return lstm(
+        fc(x, wx, bx), lod, h0, c0, w_h, w_b, w_c, is_reverse, act_gate,
+        act_cell, act_cand)
+
+
+class TestLstmOp(OpTest):
+    def set_argument(self):
+        self.lod = [[2, 3, 2]]
+
+    def setUp(self):
+        self.op_type = 'fusion_lstm'
+        self.lod = [[2, 3, 2]]
+        self.M = 8
+        self.D = 16
+        self.has_initial_state = False
+        self.is_reverse = False
+        self.act_gate = 'sigmoid'
+        self.act_cell = 'tanh'
+        self.act_cand = 'tanh'
+        self.use_peepholes = False
+        self.set_argument()
+
+        T = sum(self.lod[0])
+        bs = len(self.lod[0])
+
+        x = np.random.normal(size=(T, self.M)).astype('float64')
+        if self.has_initial_state:
+            h0 = np.random.normal(size=(bs, self.D)).astype('float64')
+            c0 = np.random.normal(size=(bs, self.D)).astype('float64')
+        else:
+            h0 = np.zeros((bs, self.D)).astype('float64')
+            c0 = np.zeros((bs, self.D)).astype('float64')
+
+        wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float64')
+
+        if self.use_peepholes:
+            b = np.random.normal(size=(1, 7 * self.D)).astype('float64')
+        else:
+            b = np.random.normal(size=(1, 4 * self.D)).astype('float64')
+        w_b = np.copy(b[:, 0:4 * self.D])
+        w_c = b[:, 4 * self.D:] if self.use_peepholes else None
+
+        # this is the weight of fc
+        wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float64')
+        # this is the bias of fc
+        # and it should be manually added into the bias of this fusion LSTM
+        bx = np.random.normal(size=(1, 4 * self.D)).astype('float64')
+        b[0, 0:4 * self.D] += bx[0, :]
+        h, c = fusion_lstm(x, self.lod, wx, bx, h0, c0, wh, w_b, w_c,
+                           self.is_reverse, ACTIVATION[self.act_gate],
+                           ACTIVATION[self.act_cell], ACTIVATION[self.act_cand])
+
+        self.inputs = {
+            'X': (x, self.lod),
+            'WeightX': wx,
+            'WeightH': wh,
+            'Bias': b
+        }
+
+        if self.has_initial_state:
+            self.inputs['H0'] = h0
+            self.inputs['C0'] = c0
+
+        self.outputs = {
+            'Hidden': (h, self.lod),
+            'Cell': (c, self.lod),
+        }
+        self.attrs = {
+            'use_peepholes': self.use_peepholes,
+            'is_reverse': self.is_reverse,
+            'gate_activation': self.act_gate,
+            'cell_activation': self.act_cell,
+            'candidate_activation': self.act_cand
+        }
+
+    def test_check_output(self):
+        self.check_output(atol=1e-8)
+
+
+class TestLstmOpInitReverse(TestLstmOp):
+    def set_argument(self):
+        self.has_initial_state = True
+        self.is_reverse = True
+
+
+class TestLstmOpMD1(TestLstmOp):
+    def set_argument(self):
+        self.M = 36
+        self.D = 8
+
+
+class TestLstmOpMD2(TestLstmOp):
+    def set_argument(self):
+        self.M = 8
+        self.D = 8
+
+
+class TestLstmOpMD3(TestLstmOp):
+    def set_argument(self):
+        self.M = 15
+        self.D = 3
+
+
+class TestLstmOpBS1(TestLstmOp):
+    def set_argument(self):
+        self.lod = [[3]]
+        self.D = 16
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py
index 4ae90864806204197c52bbbdc5516f141afd4613..bd5785aa55af241fe42a1ae2c550dbdb980f42e2 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py
index 1398166a74e714e0e902532166cde5d94ccae5f6..9a0631fa26a3e93c5c2115fd03a37de3fac46ce5 100644
--- a/python/paddle/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py
index 3ae877a60818744f852d3af9a02ffebf5e2affc8..9777ec390656d3f6166bf9f5de7bbad8b6bd786d 100644
--- a/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 from test_gaussian_random_op import TestGaussianRandomOp
diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
index 8481500fd78f0ccf34f09c66bec27e195b9aada3..496aa4111056591efce14549011d66f9ae49713a 100644
--- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy
 
diff --git a/python/paddle/fluid/tests/unittests/test_get_places_op.py b/python/paddle/fluid/tests/unittests/test_get_places_op.py
index 964423e2d2638224244b4ca774d8eee08f3ec989..441666a97b16a320692d6a15363f61156e52242b 100644
--- a/python/paddle/fluid/tests/unittests/test_get_places_op.py
+++ b/python/paddle/fluid/tests/unittests/test_get_places_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid as fluid
 from paddle.fluid.layers.device import get_places
 import decorators
diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py
index 86a2c674d01f45b2b141572c8191d2fba7fa312f..001fd7efb159e60bdf3cd0698d85dea90ad71616 100644
--- a/python/paddle/fluid/tests/unittests/test_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_op.py
@@ -12,9 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import math
+import functools
 from op_test import OpTest
 from test_lstm_op import identity, sigmoid, tanh, relu
 
@@ -38,7 +41,8 @@ class TestGRUOp(OpTest):
         for i in range(len(seq_lens)):
             seq_starts.append(seq_starts[-1] + seq_lens[i])
         sorted_seqs = sorted(
-            list(range(len(seq_lens))), lambda x, y: seq_lens[y] - seq_lens[x])
+            list(range(len(seq_lens))),
+            key=functools.cmp_to_key(lambda x, y: seq_lens[y] - seq_lens[x]))
         num_batch = seq_lens[sorted_seqs[0]]
         for batch_idx in range(num_batch):
             idx_in_seq = []
diff --git a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py
index 87a9eba4d97459082cdf1499efeddf24ed51e1b1..b5a66fdf086f2abc0c9a8af663241b9eda739407 100644
--- a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import math
 import unittest
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py b/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py
index 70586c6be3da415fbccf4114615e6f7e08de0f0f..1eb441e2c52905c2b60104de5e04037714b34648 100644
--- a/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
index daa5da8d95129af0305b326832a557daeb4c5c9c..6948ae30023a75d4735db1c78466e89e28640c9e 100644
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import math
diff --git a/python/paddle/fluid/tests/unittests/test_huber_loss_op.py b/python/paddle/fluid/tests/unittests/test_huber_loss_op.py
index a8d0a77625598e9d929a993db46ba95b0e07527a..0055ef0052fe126b268cf7a17a8307224cced99a 100644
--- a/python/paddle/fluid/tests/unittests/test_huber_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_huber_loss_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
index 13bc5768740ece00bbe285a0b47d82bb8a42d2c7..833e46483c2532e283fd672dc56cb93941f5b4ba 100644
--- a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
+++ b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
@@ -11,6 +11,8 @@
 #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #See the License for the specific language governing permissions and
 #limitations under the License.
+
+from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py
index 23b1ed957ad15bb631cd5160eb48328c76302987..405637969af6fb515a24ecb077e470279c3ffc24 100644
--- a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/unittests/test_infer_shape.py b/python/paddle/fluid/tests/unittests/test_infer_shape.py
index 699a2d42467b7ac0dcf1939bde744ad2fcb29c97..a3d700aad8236fea7bb0e6d043323ad3bd0851f2 100644
--- a/python/paddle/fluid/tests/unittests/test_infer_shape.py
+++ b/python/paddle/fluid/tests/unittests/test_infer_shape.py
@@ -12,8 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
+import six
 import paddle.fluid.core as core
 
 
@@ -27,14 +30,14 @@ class TestInferShape(unittest.TestCase):
         shape = [10, 20]
 
         # prepare input/output
-        x1 = block.var("x1")
+        x1 = block.var(six.b("x1"))
         x1.set_type(core.VarDesc.VarType.LOD_TENSOR)
         x1.set_shape(shape)
-        x2 = block.var("x2")
+        x2 = block.var(six.b("x2"))
         x2.set_type(core.VarDesc.VarType.LOD_TENSOR)
         x2.set_shape(shape)
 
-        out = block.var("out")
+        out = block.var(six.b("out"))
         out.set_type(core.VarDesc.VarType.LOD_TENSOR)
 
         # prepare the operator
@@ -57,14 +60,14 @@ class TestInferShape(unittest.TestCase):
         y_shape = [20, 30]
 
         # prepare input/output
-        x1 = block.var("x")
+        x1 = block.var(six.b("x"))
         x1.set_type(core.VarDesc.VarType.LOD_TENSOR)
         x1.set_shape(x_shape)
-        x2 = block.var("y")
+        x2 = block.var(six.b("y"))
         x2.set_type(core.VarDesc.VarType.LOD_TENSOR)
         x2.set_shape(y_shape)
 
-        out = block.var("out")
+        out = block.var(six.b("out"))
         out.set_type(core.VarDesc.VarType.LOD_TENSOR)
 
         # prepare the operator
diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py
index 4cd203155f446df07d2fe6c1d56e0d20f1113679..9962702f69644b7aef7d868f086abb390441f617 100644
--- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py
+++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py
@@ -12,8 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
+import six
 import numpy as np
 import paddle.fluid.core as core
 
@@ -48,7 +51,7 @@ class TestBook(unittest.TestCase):
 
         exe.run(init_program, feed={}, fetch_list=[])
 
-        for i in range(100):
+        for i in six.moves.xrange(100):
             tensor_x = np.array(
                 [[1, 1], [1, 2], [3, 4], [5, 2]]).astype("float32")
             tensor_y = np.array([[-2], [-3], [-7], [-7]]).astype("float32")
@@ -64,7 +67,7 @@ class TestBook(unittest.TestCase):
                                  'y': tensor_y},
                            fetch_list=[avg_cost])[0]
 
-        reload(executor)  # reload to build a new scope
+        six.moves.reload_module(executor)  # reload to build a new scope
         exe = executor.Executor(place)
 
         [infer_prog, feed_var_names, fetch_vars] = load_inference_model(
diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py
index b215e379864e919af03591ab2566c08dddbb5743..ab7183f88df809e584ca50ba16221bfdfe1376a9 100644
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 import unittest
 
diff --git a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
index eff4212d91e609a7ef531280bbd3cf3671a59830..7c1808cf998e84c22c46df68ef07259c1a021c19 100644
--- a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
+++ b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import numpy.random as random
diff --git a/python/paddle/fluid/tests/unittests/test_is_empty_op.py b/python/paddle/fluid/tests/unittests/test_is_empty_op.py
index 11121d9b65351eab639b7618fac0e54714cf4680..26d607718aec0bdffa00b9b4bca06ec6c0196217 100644
--- a/python/paddle/fluid/tests/unittests/test_is_empty_op.py
+++ b/python/paddle/fluid/tests/unittests/test_is_empty_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_l1_norm_op.py b/python/paddle/fluid/tests/unittests/test_l1_norm_op.py
index fa5b18a16f7a3e734ff8bb4f53240e8a9ce8fd8f..4e24a78ee54dfb1fb0e4f97317642cfaffe9436e 100644
--- a/python/paddle/fluid/tests/unittests/test_l1_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_l1_norm_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 import unittest
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_label_smooth_op.py b/python/paddle/fluid/tests/unittests/test_label_smooth_op.py
index ca21289a0d48a123aed90bc557ccc732702b47f1..62d385bc52cfb3a9fe15a82096ff33abc1bcc552 100644
--- a/python/paddle/fluid/tests/unittests/test_label_smooth_op.py
+++ b/python/paddle/fluid/tests/unittests/test_label_smooth_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
index 295887ccd171a3101329eb1255da146914fa9264..fb6c43136ff82af55d1fcc2969cf4a07ae081204 100644
--- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import unittest
 import numpy as np
 
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 07fd0575d333dacf309620a883e4052c6126739f..8e707c8b00b7bf3c5ea77c18c18135e89ffab9c7 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -159,7 +159,7 @@ class TestBook(unittest.TestCase):
                 input=crf_decode,
                 label=label,
                 chunk_scheme="IOB",
-                num_chunk_types=(label_dict_len - 1) / 2)
+                num_chunk_types=(label_dict_len - 1) // 2)
             self.assertFalse(crf is None)
             self.assertFalse(crf_decode is None)
 
@@ -286,7 +286,7 @@ class TestBook(unittest.TestCase):
                     name='word_{0}'.format(i), shape=[1], dtype='int64'))
 
         dict_size = 10000
-        label_word = int(window_size / 2) + 1
+        label_word = int(window_size // 2) + 1
 
         embs = []
         for i in range(window_size):
@@ -347,6 +347,25 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(loss)
         print(str(program))
 
+    def test_scatter(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(
+                name='x',
+                shape=[3, 3],
+                append_batch_size=False,
+                dtype='float32')
+            idx = layers.data(
+                name='idx', shape=[2], append_batch_size=False, dtype='int32')
+            updates = layers.data(
+                name='updates',
+                shape=[2, 3],
+                append_batch_size=False,
+                dtype='float32')
+            out = layers.scatter(input=x, index=idx, updates=updates)
+            self.assertIsNotNone(out)
+        print(str(program))
+
     def test_lod_reset(self):
         program = Program()
         with program_guard(program):
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
index e628195e7265ec564bd64a212c4a35fdff495063..0d3e6d73e0149fe633b8f1de9041068c2e3bb293 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import copy
 import math
 import unittest
diff --git a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
index 696d0ab4fa81a409a2bf0d6f6f23779ec26eb6d2..6e31e9204e95d98fcf69ed84a46d6cf3d94c808a 100644
--- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import random
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
index 1cdc69501043d120b9e3cc8ccda3a1212d205886..48b52a5412eb99fbc7a5c8534a766ede4954e849 100644
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle
 import paddle.fluid as fluid
 import os
diff --git a/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py b/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py
index d8b4e40662568f580ccff0257512cb8809488f17..15485df5ac440f2ff666ca27ef8e8bcc5df866c0 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid.layers as layers
 from paddle.fluid.executor import Executor
diff --git a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
index d53ead381d301e797d5a19784aed49a5d6f99319..865ca118d55f82c66d44f4e3d553baafa0c14c3a 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from paddle.fluid.layers import data
 from paddle.fluid.layers.control_flow import lod_rank_table
 from paddle.fluid.executor import Executor
diff --git a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
index 77905c4b96499c855fd5c5e704b8051ccdb7a323..31f364a42f624c8662a5ae087b003ca0304ae419 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
index 0ac6d9b81df0ecbe9c6560cdb0ab0507c3c2ed18..6ad27de9a0e42d1a15ec4a17804c7c0f7ebf5d94 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid.core as core
 import numpy
diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
index 9789ff4af648b41a1b53844be89249bd260de61b..6a78ef5078a738efa2ae39ea23645fedaecce63b 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid.core as core
 import numpy
diff --git a/python/paddle/fluid/tests/unittests/test_log_loss_op.py b/python/paddle/fluid/tests/unittests/test_log_loss_op.py
index d3980b8db93ca517d16b6f782ba800ce839c3f45..784f4f648d52bdf4f2357f4454d790a8d53288f3 100644
--- a/python/paddle/fluid/tests/unittests/test_log_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_log_loss_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_logical_op.py b/python/paddle/fluid/tests/unittests/test_logical_op.py
index 1d7dfe60f200459705a48664c1a5b22d2a5888d2..521851a3d57a4a3e8b2c8e1639325cc6c88fdd84 100644
--- a/python/paddle/fluid/tests/unittests/test_logical_op.py
+++ b/python/paddle/fluid/tests/unittests/test_logical_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import op_test
 import unittest
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py
index aa9eae1e882f55ef51f38e158317a1a9aeed641c..11e5d8b536fb65b66c954991bf815241774702ec 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
@@ -19,36 +21,27 @@ import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 
 
-def output_hist(out):
-    hist, _ = np.histogram(out, range=(-5, 10))
-    hist = hist.astype("float32")
-    hist /= float(out.size)
-    prob = 0.1 * np.ones((10))
-    return hist, prob
-
-
 class TestLookupSpraseTable(OpTest):
     def check_with_place(self, place):
         scope = core.Scope()
 
-        # create and initialize Id Variable
-        ids = scope.var("Ids").get_tensor()
-        ids_array = np.array([0, 2, 3, 5, 100]).astype("int64")
-        ids.set(ids_array, place)
-
         # create and initialize W Variable
-        rows = [0, 1, 2, 3, 4, 5, 6]
-        row_numel = 10000
+        table_size = 10000
+        row_numel = 8
 
         w_selected_rows = scope.var('W').get_selected_rows()
-        w_selected_rows.set_height(len(rows))
-        w_selected_rows.set_rows(rows)
-        w_array = np.ones((len(rows), row_numel)).astype("float32")
-        for i in range(len(rows)):
+        w_selected_rows.set_height(table_size)
+        w_array = np.ones((table_size, row_numel)).astype("float32")
+        for i in range(table_size):
             w_array[i] *= i
         w_tensor = w_selected_rows.get_tensor()
         w_tensor.set(w_array, place)
 
+        # create and initialize Id Variable
+        ids = scope.var("Ids").get_tensor()
+        ids_array1 = np.array([0, 2, 3, 2, 5, 0, 100]).astype("int64")
+        ids.set(ids_array1, place)
+
         # create Out Variable
         out_tensor = scope.var('Out').get_tensor()
 
@@ -64,16 +57,28 @@ class TestLookupSpraseTable(OpTest):
         lookup_table.run(scope, place)
 
         # get result from Out
-        result_array = np.array(out_tensor)
+        result_array1 = np.array(out_tensor)
         # all(): return True if all elements of the iterable are true (or if the iterable is empty)
-        for idx, row in enumerate(ids_array[:-2]):
-            assert (row == result_array[idx]).all()
+        assert (result_array1[0] == w_array[0]).all()
+        assert (result_array1[1] == w_array[1]).all()
+        assert (result_array1[2] == w_array[2]).all()
+        assert (result_array1[3] == w_array[1]).all()
+        assert (result_array1[4] == w_array[3]).all()
+        assert (result_array1[5] == w_array[0]).all()
+        assert (result_array1[6] == w_array[4]).all()
+
+        # create and initialize Id Variable
+        ids = scope.var("Ids").get_tensor()
+        ids_array2 = np.array([4, 2, 3, 7, 100000]).astype("int64")
+        ids.set(ids_array2, place)
+        lookup_table.run(scope, place)
 
-        # check the random value
-        hist, prob = output_hist(result_array[-1])
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        result_array2 = np.array(out_tensor)
+        assert (result_array2[0] == w_array[5]).all()
+        assert (result_array2[1] == w_array[1]).all()
+        assert (result_array2[2] == w_array[2]).all()
+        assert (result_array2[3] == w_array[6]).all()
+        assert (result_array2[4] == w_array[7]).all()
 
     def test_w_is_selected_rows(self):
         places = [core.CPUPlace()]
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
index ac25f432dffd544d4b336983ec868f2431a5b91a..4990ee898d81089735f6db4ee4ad6758944e311a 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
@@ -12,11 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
+import paddle.compat as cpt
 
 
 class TestLookupTableOp(OpTest):
@@ -71,7 +74,7 @@ class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds):
         flatten_idx = ids.flatten()
         padding_idx = np.random.choice(flatten_idx, 1)[0]
         self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31)
-        self.attrs = {'padding_idx': long(padding_idx)}
+        self.attrs = {'padding_idx': cpt.long_type(padding_idx)}
         self.check_output()
 
     def test_check_grad(self):
diff --git a/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py
index 966a16dc870c041b9deb140bed57d907cf305fd8..f6bb2ab7a696c40cb61dd5b38ca702b577fe7ea2 100644
--- a/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 from test_lrn_op import TestLRNOp
 
diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py
index eaff45cbb2a58798e9d55149510bec72eea370cd..bb91f26bbb53de454a6d037af4c9d96262866ce3 100644
--- a/python/paddle/fluid/tests/unittests/test_lrn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
@@ -34,7 +36,7 @@ class TestLRNOp(OpTest):
         return x + 1
 
     def get_out(self):
-        start = -(self.n - 1) / 2
+        start = -(self.n - 1) // 2
         end = start + self.n
 
         mid = np.empty((self.N, self.C, self.H, self.W)).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py
index 705a24bd8f39a55e0a352944d961f8d33aaf96ff..76a24123fc7d51231bf24a3d1a6930186c94a5db 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py b/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py
index e343265874f99afcd8201fa997932e2613fffc4c..eaa6b774c4d3e7add555c34f887e86dc847583b2 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_lstmp_op.py b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
index ed2262da4bc727657c2e65d69cb1922891e17b09..9c3ec45515ffe0a07541fd9cfb7e92b079264071 100644
--- a/python/paddle/fluid/tests/unittests/test_lstmp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
@@ -11,6 +11,8 @@
 #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #See the License for the specific language governing permissions and
 #limitations under the License.
+
+from __future__ import print_function
 import unittest
 import numpy as np
 import test_lstm_op as LstmTest
diff --git a/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py b/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py
index 97c112487fd9193ab77d18945585a554e5fbcdf8..4a7e952436bd46c92c6256b4ec2d0652cfa38959 100644
--- a/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
index 852a80261e02f5ed19e7fbe608d490be1f7798a9..b25d40a3a15e259878222ee5482cd842543b63d6 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import decorators
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_op.py b/python/paddle/fluid/tests/unittests/test_matmul_op.py
index cae2c8fa87d9857de8f26cf4962d9370eca66243..abf10437d83268a6a84a1c62399eb02cd3b1d663 100644
--- a/python/paddle/fluid/tests/unittests/test_matmul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_maxout_op.py b/python/paddle/fluid/tests/unittests/test_maxout_op.py
index f5ddf72516bf8adb41698d9b2e22c7de74a3fad9..d588b22fe2607a6041359d420ebba757d8a632d6 100644
--- a/python/paddle/fluid/tests/unittests/test_maxout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_maxout_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
@@ -19,7 +21,7 @@ from op_test import OpTest
 
 def maxout_forward_naive(input, groups):
     s0, s1, s2, s3 = input.shape
-    return np.ndarray([s0, s1 / groups, groups, s2, s3], \
+    return np.ndarray([s0, s1 // groups, groups, s2, s3], \
         buffer = input, dtype=input.dtype).max(axis=(2))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_mean_iou.py b/python/paddle/fluid/tests/unittests/test_mean_iou.py
index 32b4ee184787cd4cda0fd889f67a609141a3cb27..03e94483178e83adad9886cd7df2107581360dd1 100644
--- a/python/paddle/fluid/tests/unittests/test_mean_iou.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from __future__ import division
 import unittest
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py
index 15472a8fc4716f218b0eddf17589634c565130b1..ff338f0e0037307e81a92eed804096c9a2a87361 100644
--- a/python/paddle/fluid/tests/unittests/test_mean_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_memory_usage.py b/python/paddle/fluid/tests/unittests/test_memory_usage.py
index f9daf83652e18faab0ab31402b9f5889a0beceaf..4cdb5b5d9f7f020c4eb9a3b3a804c074d7ddbb35 100644
--- a/python/paddle/fluid/tests/unittests/test_memory_usage.py
+++ b/python/paddle/fluid/tests/unittests/test_memory_usage.py
@@ -34,7 +34,7 @@ def train_simulator(test_batch_size=10):
     sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
     sgd_optimizer.minimize(avg_cost)
 
-    # Calculate memory usage in current network config 
+    # Calculate memory usage in current network config
     lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
         fluid.default_main_program(), batch_size=test_batch_size)
 
diff --git a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
index f209bdf30faffc0b2c7932b7b10f384d6d61a831..26ce7024117162e8bad403a9d8b8518c27578c83 100644
--- a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
index 54ee85c1a7a539fe9517f32adb35ab99b5ae2a07..4e5cc91268c5df4be3de3c04a82ef65b33cf4d20 100644
--- a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import sys
diff --git a/python/paddle/fluid/tests/unittests/test_minus_op.py b/python/paddle/fluid/tests/unittests/test_minus_op.py
index ee32bd49925e266cfd5beb51496355e111a3d0d2..54253b17b967871b03628023c5a9fdb339af1828 100644
--- a/python/paddle/fluid/tests/unittests/test_minus_op.py
+++ b/python/paddle/fluid/tests/unittests/test_minus_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py b/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py
index 62035efe8ec3809a7672e58697de4304426338d7..02fecfe47ec3fbff085b0a7f24316e5d0f6cd814 100644
--- a/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py
index c75d3bd276a5b494090c1aa1fea0bb4f2c067173..7137fd0fdb7c503492107da684b95989037eb872 100644
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py
index bbc782c1bce302df68ab30013f3a7667e51ed479..fca4ffa88b79ebfad009b436d440e86ddceaaed7 100644
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
index cb0ea96ff69ce32b0bb1b49f0318c353aa08d388..09788868ccb926f56c2f622b5caf695670fd17f8 100644
--- a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
index 7fc9f550440d3d0e1a8182a69f5692b3df0aa258..4fae11e928dc7e066799a8936bada0e252afaa42 100644
--- a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
index 10cb78a08db0471699bcc0b7323d5346e3af64c7..df0562dcc79cbb960136c19d2b3f243cf2e09782 100644
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
@@ -11,6 +11,8 @@
 #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #See the License for the specific language governing permissions and
 #limitations under the License.
+
+from __future__ import print_function
 import unittest
 import numpy as np
 import copy
diff --git a/python/paddle/fluid/tests/unittests/test_multihead_attention.py b/python/paddle/fluid/tests/unittests/test_multihead_attention.py
index 80c3c67967e970a7182c008b6cfd138aff044167..f60da862ac091ca1eefccfe2834201d1c79e2def 100644
--- a/python/paddle/fluid/tests/unittests/test_multihead_attention.py
+++ b/python/paddle/fluid/tests/unittests/test_multihead_attention.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_multiplex_op.py b/python/paddle/fluid/tests/unittests/test_multiplex_op.py
index 03cad8b43bac053f246156b8c57df9fceab20dcd..1567a74808aa37e5e18bbe583cc1d8987b31cd58 100644
--- a/python/paddle/fluid/tests/unittests/test_multiplex_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiplex_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_nce.py b/python/paddle/fluid/tests/unittests/test_nce.py
index 7431a142c53a64e58872390776904ce8f781d6a9..0745bd274f73715b6fdec236819b8d89827e1346 100644
--- a/python/paddle/fluid/tests/unittests/test_nce.py
+++ b/python/paddle/fluid/tests/unittests/test_nce.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_network_with_dtype.py b/python/paddle/fluid/tests/unittests/test_network_with_dtype.py
index d4835dd18405fc7a0d508a780a734922e0abd12c..60dcf195daf61d76a2e6d6f764fa216270804f55 100644
--- a/python/paddle/fluid/tests/unittests/test_network_with_dtype.py
+++ b/python/paddle/fluid/tests/unittests/test_network_with_dtype.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py
index 108a665f37f5cd652ec83f784a56ca52e6b49fe8..22bc45ff1ea0efea0ec766a6a9e819cdd81b0866 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
index 198c68866d399023c51c2a43b588aa8ec49c3c9a..24fdcf8c88417244e981194e63bd77a2fdbd179d 100644
--- a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
+++ b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_nvprof.py b/python/paddle/fluid/tests/unittests/test_nvprof.py
index 226e5e5d1131b1f33cfbbfefec536e6974f85b36..da943d64da6cfc64d121b7373f7c067c1cff731c 100644
--- a/python/paddle/fluid/tests/unittests/test_nvprof.py
+++ b/python/paddle/fluid/tests/unittests/test_nvprof.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import os
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
index 06fccd39ac65ab62ee5618ac19d1a0535b481d06..7afdae804a65b9fb05a521a1b08ce0bfb21d721f 100644
--- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py
+++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import math
diff --git a/python/paddle/fluid/tests/unittests/test_op_support_gpu.py b/python/paddle/fluid/tests/unittests/test_op_support_gpu.py
index 5fafb8280e19cca46e5bf687494c07200ca53153..e203fccd03f86077c51e176456c1c313ac14a9ee 100644
--- a/python/paddle/fluid/tests/unittests/test_op_support_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_op_support_gpu.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid.core as core
 
diff --git a/python/paddle/fluid/tests/unittests/test_operator.py b/python/paddle/fluid/tests/unittests/test_operator.py
index 5e418fe6ac2d62948762290a65686207d017275c..544fca8cecd0a2b94a5aec40b9442f86036fd4d2 100644
--- a/python/paddle/fluid/tests/unittests/test_operator.py
+++ b/python/paddle/fluid/tests/unittests/test_operator.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 import paddle.fluid.op as op
diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py
index c098a5a0cb0364f9ec93c95c1ef50912e574b3d9..6d01955993324498de42462b7f85ef6f8e444505 100644
--- a/python/paddle/fluid/tests/unittests/test_operator_desc.py
+++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py
@@ -12,9 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 import paddle.fluid.core as core
+import paddle.compat as cpt
 
 from paddle.fluid.framework import Program, default_startup_program
 
@@ -29,14 +32,15 @@ class TestOperator(unittest.TestCase):
             self.assertFail()
         except ValueError as v_err:
             self.assertEqual(
-                v_err.message,
+                cpt.get_exception_message(v_err),
                 "`type` to initilized an Operator can not be None.")
         try:
             block.append_op(type="no_such_op")
             self.assertFail()
         except ValueError as a_err:
-            self.assertEqual(a_err.message,
-                             "Operator \"no_such_op\" has not been registered.")
+            self.assertEqual(
+                cpt.get_exception_message(a_err),
+                "Operator \"no_such_op\" has not been registered.")
 
     def test_op_desc_creation(self):
         program = Program()
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py
index 18921d727f94a85b69259c07273f09c3e19390c6..4374d198f2f869afab5fb76fdcb43e3c445f7689 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 import paddle.fluid.framework as framework
diff --git a/python/paddle/fluid/tests/unittests/test_pad_op.py b/python/paddle/fluid/tests/unittests/test_pad_op.py
index 300f3ffcb8d1a6152b1e03f1356582c02bc4b2a3..58e56ca1a4dbdc48765a36e1a64b9a2ec8cf9025 100644
--- a/python/paddle/fluid/tests/unittests/test_pad_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pad_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
index d17e493c36a2ffcba632f5f85c7a1d2e5066dd1c..6d6917300cb66afcc8a0c509986a0f26be8b1f09 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.dataset.conll05 as conll05
 import paddle.fluid as fluid
 import unittest
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
index a43f2e7c49c02ce779344da44e640cabbf27986c..372ef748b2e704fd3858c382e048e51448ed3bd5 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.dataset.flowers as flowers
 import math
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index 9448d89cd58f4e5cff4bac49821fbc44c5a46246..5b96d641d667eee1aa0c7c6019bf92494f777259 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from parallel_executor_test_base import TestParallelExecutorBase
 import paddle.fluid as fluid
 import paddle.fluid.core as core
@@ -181,7 +183,9 @@ class TestMNIST(TestParallelExecutorBase):
             use_parallel_executor=True)
 
         self.assertAlmostEquals(
-            np.mean(parallel_first_loss), single_first_loss, delta=1e-6)
+            np.mean(parallel_first_loss),
+            single_first_loss,
+            delta=1e-6, )
         self.assertAlmostEquals(
             np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
 
@@ -189,7 +193,7 @@ class TestMNIST(TestParallelExecutorBase):
         self.check_simple_fc_parallel_accuracy(True)
         self.check_simple_fc_parallel_accuracy(False)
 
-    def check_batchnorm_fc_convergence(self, use_cuda):
+    def check_batchnorm_fc_convergence(self, use_cuda, use_fast_executor):
         if use_cuda and not core.is_compiled_with_cuda():
             return
 
@@ -201,11 +205,13 @@ class TestMNIST(TestParallelExecutorBase):
             fc_with_batchnorm,
             feed_dict={"image": img,
                        "label": label},
-            use_cuda=use_cuda)
+            use_cuda=use_cuda,
+            use_fast_executor=use_fast_executor)
 
     def test_batchnorm_fc(self):
-        self.check_batchnorm_fc_convergence(True)
-        self.check_batchnorm_fc_convergence(False)
+        for use_cuda in (False, True):
+            for use_fast_executor in (False, True):
+                self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor)
 
     def test_batchnorm_fc_with_new_strategy(self):
         # FIXME(zcd): close this test temporally.
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
index a28428d8dee201ba105e18684c15d4b4582d989f..cc2d692e18430eb48e6e800106eab0c3739d3f53 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid as fluid
 import paddle.fluid.layers.ops as ops
 from paddle.fluid.initializer import init_on_cpu
@@ -46,7 +48,7 @@ def squeeze_excitation(input, num_channels, reduction_ratio):
     pool = fluid.layers.reduce_mean(input=reshape, dim=2)
 
     squeeze = fluid.layers.fc(input=pool,
-                              size=num_channels / reduction_ratio,
+                              size=num_channels // reduction_ratio,
                               act='relu')
     excitation = fluid.layers.fc(input=squeeze,
                                  size=num_channels,
@@ -62,7 +64,7 @@ def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
         num_filters=num_filters,
         filter_size=filter_size,
         stride=stride,
-        padding=(filter_size - 1) / 2,
+        padding=(filter_size - 1) // 2,
         groups=groups,
         act=None,
         bias_attr=False)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
index fcb5947ff05efd1c48ab9ec129ac9d17255d7020..f5a0ba624698b49e0d323e6f830be23a4148392b 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
index 8203d5d1fce0950130ab71db40fb306f73c41bd4..5ad922725a0b692e28552737a99b745ed09ddbd5 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid as fluid
 import transformer_model
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py
index c9617e36778740ce9620c3ad495c64c17277fde1..d7b9af8bac67ef89cc1ae59ccf002c2c488f3436 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/unittests/test_parameter.py b/python/paddle/fluid/tests/unittests/test_parameter.py
index e09865074e8aa9345fd9cc84e1f19eaf0436142f..df42e6cb9a050b76099b4a53fdd08d2852284d1f 100644
--- a/python/paddle/fluid/tests/unittests/test_parameter.py
+++ b/python/paddle/fluid/tests/unittests/test_parameter.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 from paddle.fluid.framework import default_main_program
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
index 8aff4e87f67bc61a162f09e982cf0a7a61639257..dfedf8190f75ec26532f281338f076ca0c7d83af 100644
--- a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
+++ b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
index 003ebba18b26198427d9f313596ae85656ac24fa..14d7ed9057d622b136056e1b5bbbe57f9a04d5d7 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 from test_pool2d_op import TestPool2d_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
 
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
index 1cf70311b40bc7648b7462e93f201aa33c77b137..26969bd5230afdac83a943d2dc21094a0972d60a 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 
@@ -29,11 +31,11 @@ def max_pool2D_forward_naive(x,
     if global_pool == 1:
         ksize = [H, W]
     H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
-             ) / strides[0] + 1 if ceil_mode else (H - ksize[0] + 2 *
-                                                   paddings[0]) / strides[0] + 1
+             ) // strides[0] + 1 if ceil_mode else (
+                 H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
     W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
-             ) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 *
-                                                   paddings[1]) / strides[1] + 1
+             ) // strides[1] + 1 if ceil_mode else (
+                 W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
     for i in range(H_out):
         for j in range(W_out):
@@ -57,11 +59,11 @@ def avg_pool2D_forward_naive(x,
     if global_pool == 1:
         ksize = [H, W]
     H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
-             ) / strides[0] + 1 if ceil_mode else (H - ksize[0] + 2 *
-                                                   paddings[0]) / strides[0] + 1
+             ) // strides[0] + 1 if ceil_mode else (
+                 H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
     W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
-             ) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 *
-                                                   paddings[1]) / strides[1] + 1
+             ) // strides[1] + 1 if ceil_mode else (
+                 W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
     for i in range(H_out):
         for j in range(W_out):
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
index 92c64b37921eafd4c90e247a235f2dacea8fea1e..77045c1307baead3711d58ed368dfa5f2acc3699 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 
@@ -29,14 +31,14 @@ def max_pool3D_forward_naive(x,
     if global_pool == 1:
         ksize = [D, H, W]
     D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
-             ) / strides[0] + 1 if ceil_mode else (H - ksize[0] + 2 *
-                                                   paddings[0]) / strides[0] + 1
+             ) // strides[0] + 1 if ceil_mode else (
+                 H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
     H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1
-             ) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 *
-                                                   paddings[1]) / strides[1] + 1
+             ) // strides[1] + 1 if ceil_mode else (
+                 W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
     W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1
-             ) / strides[2] + 1 if ceil_mode else (W - ksize[2] + 2 *
-                                                   paddings[2]) / strides[2] + 1
+             ) // strides[2] + 1 if ceil_mode else (
+                 W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
     for k in range(D_out):
         d_start = np.max((k * strides[0] - paddings[0], 0))
@@ -63,14 +65,14 @@ def avg_pool3D_forward_naive(x,
     if global_pool == 1:
         ksize = [D, H, W]
     D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
-             ) / strides[0] + 1 if ceil_mode else (H - ksize[0] + 2 *
-                                                   paddings[0]) / strides[0] + 1
+             ) // strides[0] + 1 if ceil_mode else (
+                 H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
     H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1
-             ) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 *
-                                                   paddings[1]) / strides[1] + 1
+             ) // strides[1] + 1 if ceil_mode else (
+                 W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
     W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1
-             ) / strides[2] + 1 if ceil_mode else (W - ksize[2] + 2 *
-                                                   paddings[2]) / strides[2] + 1
+             ) // strides[2] + 1 if ceil_mode else (
+                 W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
     for k in range(D_out):
         d_start = np.max((k * strides[0] - paddings[0], 0))
diff --git a/python/paddle/fluid/tests/unittests/test_pool_max_op.py b/python/paddle/fluid/tests/unittests/test_pool_max_op.py
index e6a9f6f08cf1445c14494506641b0c3502591c37..488ff431d4f2ef76ce0c9486d8c307b4e01b5544 100644
--- a/python/paddle/fluid/tests/unittests/test_pool_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool_max_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
@@ -24,9 +26,9 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False):
         ksize = [D, H, W]
         paddings = [0, 0, 0]
 
-    D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
-    H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
-    W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+    D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
     mask = np.zeros((N, C, D_out, H_out, W_out))
     for k in range(D_out):
@@ -63,8 +65,8 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False):
         ksize = [H, W]
         paddings = [0, 0]
 
-    H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
-    W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
     mask = np.zeros((N, C, H_out, W_out))
     for i in range(H_out):
diff --git a/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py b/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py
index 8c76393bdaccc0b701b409efebf08fac95aa5f1a..afe8d212d6ec218c3799780849c377e46a44bd6c 100644
--- a/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py
+++ b/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py
@@ -12,9 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import itertools
 import numpy as np
+import six
 from op_test import OpTest
 
 
@@ -32,7 +35,7 @@ def py_pnpair_op(score, label, query, column=-1, weight=None):
 
     # accumulate statistics
     pos, neg, neu = 0, 0, 0
-    for _, ranks in list(predictions.items()):
+    for _, ranks in six.iteritems(predictions):
         for e1, e2 in itertools.combinations(ranks, 2):
             s1, s2, l1, l2, w1, w2 = e1[0], e2[0], e1[1], e2[1], e1[2], e2[2]
             w = (w1 + w2) * 0.5
diff --git a/python/paddle/fluid/tests/unittests/test_precision_recall_op.py b/python/paddle/fluid/tests/unittests/test_precision_recall_op.py
index 5ae425fee18b9b1baa0b945782268b79d6bb6625..645637625959f214db3875bc58e4c593c27ae8f6 100644
--- a/python/paddle/fluid/tests/unittests/test_precision_recall_op.py
+++ b/python/paddle/fluid/tests/unittests/test_precision_recall_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py
index cb7de3fc93c0379ea50c88044876d6a8ee617a69..979be5af3bdc24b1a2fc115198eeab53469a91c0 100644
--- a/python/paddle/fluid/tests/unittests/test_prelu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_preprocessor.py b/python/paddle/fluid/tests/unittests/test_preprocessor.py
index cbf1a7e0c50a87cd43507ffdb94109873cf4e5d9..98e609b76982650c9d18f87c3c0637056cc40899 100644
--- a/python/paddle/fluid/tests/unittests/test_preprocessor.py
+++ b/python/paddle/fluid/tests/unittests/test_preprocessor.py
@@ -12,12 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 
+import paddle
 import paddle.fluid as fluid
-import paddle.v2 as paddle
-import paddle.v2.dataset.mnist as mnist
+import paddle.dataset.mnist as mnist
 
 
 class TestPreprocessor(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py
index b461c5c9401d74ef8dcf4afc84dc0ea6920a2419..ac682d6181cfcc5a064a51a736b03d493c37b780 100644
--- a/python/paddle/fluid/tests/unittests/test_print_op.py
+++ b/python/paddle/fluid/tests/unittests/test_print_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid.core as core
 from paddle.fluid.executor import Executor
diff --git a/python/paddle/fluid/tests/unittests/test_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_prior_box_op.py
index e15554737b9f3fa36382dde15ded928271679538..7381b74af71051f8b993ba6d116b5282dd9b84e1 100644
--- a/python/paddle/fluid/tests/unittests/test_prior_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prior_box_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import sys
diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py
index 9f8d33f9bbfc78b6f1a0c089b34b2f41d561c640..7934164b84931f886967982ce0cb65c406bbf800 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import os
 import numpy as np
@@ -23,9 +25,6 @@ import paddle.fluid.core as core
 
 class TestProfiler(unittest.TestCase):
     def net_profiler(self, state, profile_path='/tmp/profile'):
-        enable_if_gpu = state == 'GPU' or state == "All"
-        if enable_if_gpu and not core.is_compiled_with_cuda():
-            return
         startup_program = fluid.Program()
         main_program = fluid.Program()
 
@@ -79,8 +78,6 @@ class TestProfiler(unittest.TestCase):
                 pass_acc_calculator.add(value=acc, weight=b_size)
                 pass_acc = pass_acc_calculator.eval()
 
-    @unittest.skipIf(not core.is_compiled_with_cuda(),
-                     "profiler is enabled only with GPU")
     def test_cpu_profiler(self):
         self.net_profiler('CPU')
 
@@ -93,7 +90,7 @@ class TestProfiler(unittest.TestCase):
                      "profiler is enabled only with GPU")
     def test_all_profiler(self):
         self.net_profiler('All', '/tmp/profile_out')
-        with open('/tmp/profile_out', 'r') as f:
+        with open('/tmp/profile_out', 'rb') as f:
             self.assertGreater(len(f.read()), 0)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_protobuf.py b/python/paddle/fluid/tests/unittests/test_protobuf.py
index c3f1fa80185bfc4afc3ed715d736bcba092629d8..7b80927c48d02e83a9bfaac572c81a6a95a69c8c 100644
--- a/python/paddle/fluid/tests/unittests/test_protobuf.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid.proto.framework_pb2 as framework_pb2
 import unittest
 
diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
index 9853fb4e9a89944bfdf2954e3d3d86fef92ac93c..d24b5cbd06ddf9f332c1369ebd513bef27b77e14 100644
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
@@ -12,8 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid.core as core
+import paddle.compat as cpt
 from paddle.fluid.framework import Program
 
 
@@ -108,7 +111,7 @@ class TestVarDesc(unittest.TestCase):
     def test_shape(self):
         program_desc = core.ProgramDesc()
         block = program_desc.block(0)
-        var = block.var('my_var')
+        var = block.var(cpt.to_bytes('my_var'))
         var.set_type(core.VarDesc.VarType.SELECTED_ROWS)
         src_shape = [3, 2, 10, 8]
         var.set_shape(src_shape)
@@ -119,7 +122,7 @@ class TestVarDesc(unittest.TestCase):
     def test_multiple_shape(self):
         program_desc = core.ProgramDesc()
         block = program_desc.block(0)
-        var = block.var('my_reader')
+        var = block.var(cpt.to_bytes('my_reader'))
         var.set_type(core.VarDesc.VarType.READER)
         src_shapes = [[2, 3, 3], [4, 5], [6, 7, 8, 9]]
         var.set_shapes(src_shapes)
@@ -130,7 +133,7 @@ class TestVarDesc(unittest.TestCase):
     def test_dtype(self):
         program_desc = core.ProgramDesc()
         block = program_desc.block(0)
-        var = block.var('my_var')
+        var = block.var(cpt.to_bytes('my_var'))
         var.set_type(core.VarDesc.VarType.LOD_TENSOR)
         var.set_dtype(core.VarDesc.VarType.INT32)
         self.assertEqual(core.VarDesc.VarType.INT32, var.dtype())
@@ -139,7 +142,7 @@ class TestVarDesc(unittest.TestCase):
     def test_multiple_dtype(self):
         program_desc = core.ProgramDesc()
         block = program_desc.block(0)
-        var = block.var('my_reader')
+        var = block.var(cpt.to_bytes('my_reader'))
         var.set_type(core.VarDesc.VarType.READER)
         src_types = [
             core.VarDesc.VarType.INT32, core.VarDesc.VarType.FP64,
@@ -152,7 +155,7 @@ class TestVarDesc(unittest.TestCase):
     def test_multiple_lod_level(self):
         program_desc = core.ProgramDesc()
         block = program_desc.block(0)
-        var = block.var('my_reader')
+        var = block.var(cpt.to_bytes('my_reader'))
         var.set_type(core.VarDesc.VarType.READER)
         src_types = [3, 1, 2]
         var.set_lod_levels(src_types)
@@ -166,12 +169,12 @@ class TestBlockDesc(unittest.TestCase):
         self.assertIsNotNone(program_desc)
         block = program_desc.block(0)
         self.assertIsNotNone(block)
-        var1 = block.var("var1")
-        var2 = block.var("var2")
-        var3 = block.var("var3")
+        var1 = block.var(cpt.to_bytes("var1"))
+        var2 = block.var(cpt.to_bytes("var2"))
+        var3 = block.var(cpt.to_bytes("var3"))
         all_vars = block.all_vars()
         self.assertEqual(set(all_vars), {var1, var2, var3})
-        var2_re = block.find_var("var2")
+        var2_re = block.find_var(cpt.to_bytes("var2"))
         self.assertEqual(var2_re, var2)
 
     def test_add_op(self):
diff --git a/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py
index 3c2689585061af5a11a247a01b87b432dcd86e13..57e96f1fa34fa94f5e095d088016655f24b58d0c 100644
--- a/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py
+++ b/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py b/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py
index 137594b9a08e13bf6c3f3779356c209596f9ba8e..067502baecc73cc84a6aa8ab78a9afbcc191c49a 100644
--- a/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
index f9bda5e4701f693f41fe7041ba0f5ec80b6fc31c..3efe5aac8848b8230f42f4f3905eefc517c0fa5e 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid as fluid
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
index 9a379bdbaa7e278879117a8cdc2dddb335a10ca1..931cac409f26fce4ecca18c4b0cfcca2e675046f 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_random_crop_op.py b/python/paddle/fluid/tests/unittests/test_random_crop_op.py
index 1c708d0386da4028f1f3d177d0a3fd494c077c6e..f29dddff7a28ed041908741007361224624e436a 100644
--- a/python/paddle/fluid/tests/unittests/test_random_crop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_random_crop_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import paddle.fluid.core as core
@@ -21,11 +23,12 @@ from op_test import OpTest
 class TestRandomCropOp(OpTest):
     def setUp(self):
         to_crop = np.array([[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]] *
-                           5).astype("float32")
+                           5).astype(np.int32)
         self.possible_res = [
-            np.array([[1, 2, 3], [5, 6, 7]]), np.array([[2, 3, 4], [6, 7, 8]]),
-            np.array([[5, 6, 7], [9, 10, 11]]),
-            np.array([[6, 7, 8], [10, 11, 12]])
+            np.array([[1, 2, 3], [5, 6, 7]]).astype(np.int32),
+            np.array([[2, 3, 4], [6, 7, 8]]).astype(np.int32),
+            np.array([[5, 6, 7], [9, 10, 11]]).astype(np.int32),
+            np.array([[6, 7, 8], [10, 11, 12]]).astype(np.int32)
         ]
         self.op_type = "random_crop"
         self.inputs = {'X': to_crop, 'Seed': np.array([10])}
diff --git a/python/paddle/fluid/tests/unittests/test_rank_loss_op.py b/python/paddle/fluid/tests/unittests/test_rank_loss_op.py
index 7eba1e2077e25325d537f01f043ed1afa372800c..c9fa24b103deb50aa896403e09b11e891fb62c6d 100644
--- a/python/paddle/fluid/tests/unittests/test_rank_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rank_loss_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py
index 3ad85d57485956e0cadb197dadd172516fa15c39..8ad11d76f683d556f05cafc3251acc942efef72f 100644
--- a/python/paddle/fluid/tests/unittests/test_reader_reset.py
+++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py
@@ -12,8 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid as fluid
-import paddle.v2 as paddle
+import paddle
 import numpy as np
 import unittest
 
diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
index 69a522e273db017ac55b408276b4a28f5f907c42..c5210bb2085bc386df43cd0d20292d7b308a1093 100644
--- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
@@ -12,11 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 import paddle.fluid as fluid
-import paddle.v2 as paddle
-import paddle.v2.dataset.mnist as mnist
+import paddle
+import paddle.dataset.mnist as mnist
 
 
 class TestRecordIO(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
index 2e22df2beba9d74e28788fb72f6f7f7f2bef534e..6dfc85e301a2eda66bade09a8b6dd0004155f385 100644
--- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py
+++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 import paddle.fluid.layers as layers
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index 06d116601bf733986ccf9c725340456ab1258be2..328f0f0011381b77cccb8b2d9b266aa53b259473 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_registry.py b/python/paddle/fluid/tests/unittests/test_registry.py
index a361c4624e3e2efa817e8137ff31133997a0a1fb..7381bb61eb4630cb67bc306fde211704e9580af4 100644
--- a/python/paddle/fluid/tests/unittests/test_registry.py
+++ b/python/paddle/fluid/tests/unittests/test_registry.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 import unittest
 
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py
index 9b1c4ceada52322b3f1fdc4ab2e90a2c089ee67e..6727335c6059161d235a64a1b90d36b84004f9b3 100644
--- a/python/paddle/fluid/tests/unittests/test_regularizer.py
+++ b/python/paddle/fluid/tests/unittests/test_regularizer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 import paddle.fluid.framework as framework
diff --git a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
index 6e1cd56b3e309fc014dc981a1e3aa841159fca15..28c8c4699adbc108c05e4a500815752e2ec24c61 100644
--- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
@@ -12,11 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.layers.control_flow import lod_rank_table
 import numpy
+import functools
 
 
 class TestReorderLoDTensor(unittest.TestCase):
@@ -101,7 +104,8 @@ class TestReorderLoDTensor(unittest.TestCase):
         rank_table = []  # list of (index, length)
         for i in range(len(ref_lod)):
             rank_table.append((i, ref_lod[i]))
-        rank_table = sorted(rank_table, lambda x, y: y[1] - x[1])
+        rank_table = sorted(
+            rank_table, key=functools.cmp_to_key(lambda x, y: y[1] - x[1]))
 
         # compute the input sequence info according to input_lod
         input_value, input_lod = self.data[self.data_desc[0][0]]
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index 2f5558578ac2a002a83c2a7e027ec5a96d8b4414..1de35dc35b0176b77eb2d9b25cd6ee4e645e56c3 100644
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 
diff --git a/python/paddle/fluid/tests/unittests/test_reverse_op.py b/python/paddle/fluid/tests/unittests/test_reverse_op.py
index f845575a02869f08299d76b5600074598ca27f6c..e83f548c228c7c045ff795e882738ea56e3f2d24 100644
--- a/python/paddle/fluid/tests/unittests/test_reverse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
index 0d84a5853ead45b84de9383dd8749992d2f91440..3d4623c74d9a307b12ab6d72ad0b4d2dae938720 100644
--- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py b/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py
index 178606f05961263df5ef0398064a1fd135fbe784..9bfec8e9bdd8c4667fb19f3dd479b759d6dd665b 100644
--- a/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 from paddle.fluid.framework import Program
diff --git a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
index df5684ab173a4889dd7b693f9246bafd12e0345f..ed7f467835f32242a9650f226b4a5ad9d6d87af4 100644
--- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
@@ -12,10 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import math
 import sys
+import paddle.compat as cpt
 from op_test import OpTest
 
 
@@ -59,10 +62,10 @@ class TestROIPoolOp(OpTest):
         for i in range(self.rois_num):
             roi = self.rois[i]
             roi_batch_id = roi[0]
-            roi_start_w = int(round(roi[1] * self.spatial_scale))
-            roi_start_h = int(round(roi[2] * self.spatial_scale))
-            roi_end_w = int(round(roi[3] * self.spatial_scale))
-            roi_end_h = int(round(roi[4] * self.spatial_scale))
+            roi_start_w = int(cpt.round(roi[1] * self.spatial_scale))
+            roi_start_h = int(cpt.round(roi[2] * self.spatial_scale))
+            roi_end_w = int(cpt.round(roi[3] * self.spatial_scale))
+            roi_end_h = int(cpt.round(roi[4] * self.spatial_scale))
 
             roi_height = int(max(roi_end_h - roi_start_h + 1, 1))
             roi_width = int(max(roi_end_w - roi_start_w + 1, 1))
@@ -97,8 +100,8 @@ class TestROIPoolOp(OpTest):
                             for w in range(wstart, wend):
                                 if x_i[c, h, w] > out_data[i, c, ph, pw]:
                                     out_data[i, c, ph, pw] = x_i[c, h, w]
-                                    argmax_data[i, c, ph, pw] = h * \
-                                        self.width + w
+                                    argmax_data[i, c, ph,
+                                                pw] = h * self.width + w
 
         self.outs = out_data.astype('float32')
         self.argmaxes = argmax_data.astype('int64')
@@ -110,14 +113,14 @@ class TestROIPoolOp(OpTest):
             self.rois_lod[0].append(bno + 1)
             for i in range(bno + 1):
                 x1 = np.random.random_integers(
-                    0, self.width / self.spatial_scale - self.pooled_width)
+                    0, self.width // self.spatial_scale - self.pooled_width)
                 y1 = np.random.random_integers(
-                    0, self.height / self.spatial_scale - self.pooled_height)
+                    0, self.height // self.spatial_scale - self.pooled_height)
 
                 x2 = np.random.random_integers(x1 + self.pooled_width,
-                                               self.width / self.spatial_scale)
-                y2 = np.random.random_integers(y1 + self.pooled_height,
-                                               self.height / self.spatial_scale)
+                                               self.width // self.spatial_scale)
+                y2 = np.random.random_integers(
+                    y1 + self.pooled_height, self.height // self.spatial_scale)
 
                 roi = [bno, x1, y1, x2, y2]
                 rois.append(roi)
diff --git a/python/paddle/fluid/tests/unittests/test_row_conv_op.py b/python/paddle/fluid/tests/unittests/test_row_conv_op.py
index 07dcd108689ae6069e30fe22029258d192215549..2f13f067ef313685227c7de9a49fae8640ca6b32 100644
--- a/python/paddle/fluid/tests/unittests/test_row_conv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_row_conv_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
index df6e0faaca6fd007b39a8f358d964055e149a025..08c462d9036cacab81dab7c9ea16664c9159479f 100644
--- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..708265b4576809b1f4157d54989c6138c6e5a2b0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
@@ -0,0 +1,51 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+
+
+class TestSamplingIdOp(OpTest):
+    def setUp(self):
+        self.op_type = "sampling_id"
+        self.use_mkldnn = False
+        self.init_kernel_type()
+        self.X = np.random.random((8, 4)).astype('float32')
+        self.inputs = {"X": self.X}
+        self.Y = np.random.random(8).astype('float32')
+        self.outputs = {'Out': self.Y}
+        self.attrs = {'max': 1.0, 'min': 0.0, 'seed': 1}
+
+    def test_check_output(self):
+        self.check_output_customized(self.verify_output)
+        y1 = self.out
+        self.check_output_customized(self.verify_output)
+        y2 = self.out
+        self.assertTrue(np.array_equal(y1, y2))
+        self.assertEqual(len(y1), len(self.Y))
+
+    def verify_output(self, outs):
+        out = np.array(outs[0])
+        self.out = out
+
+    def init_kernel_type(self):
+        pass
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py
index 53f59c399054a96f5b5f07a390e6fa9eeae878ce..0a8a43253d79ba21c7333dd19af05d8adf410289 100644
--- a/python/paddle/fluid/tests/unittests/test_scale_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scale_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py
index fb1728743630b3ea908ae835444eff7fd71b72c8..088996f9d7dee1ea914e36e3342c9a5110001c44 100644
--- a/python/paddle/fluid/tests/unittests/test_scatter_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_scope.py b/python/paddle/fluid/tests/unittests/test_scope.py
index d249a989a9499d01f6ed10d6cdbc6c456a7262c5..45fcbfba6eb7c6fc4e75f6d8228d721c0186ef36 100644
--- a/python/paddle/fluid/tests/unittests/test_scope.py
+++ b/python/paddle/fluid/tests/unittests/test_scope.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid.core
 import unittest
 
diff --git a/python/paddle/fluid/tests/unittests/test_selected_rows.py b/python/paddle/fluid/tests/unittests/test_selected_rows.py
index f504a06ffff8cb636498652554fca05e22bb905d..2f34f79b8eafad8e7fdf6b359548747f354b141f 100644
--- a/python/paddle/fluid/tests/unittests/test_selected_rows.py
+++ b/python/paddle/fluid/tests/unittests/test_selected_rows.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid.core as core
 import unittest
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
index 11ffa761a690eb1f9f6dc50c45128a99301741db..9d1d139721ad7ee3e29d44c9b3e7c666b78a4556 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import sys
diff --git a/python/paddle/fluid/tests/unittests/test_seq_conv.py b/python/paddle/fluid/tests/unittests/test_seq_conv.py
index 1a6e1aad799e77b8e746353bee93680691939d24..dcc86382e5286f354c4f2e81ead598f12c75b2c1 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_conv.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import random
diff --git a/python/paddle/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py
index 0b3659d7a67956f7546d368346bd102eeedf1d97..66e77714c5d65d51262f76519901032182985ea8 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_pool.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
index 8f0765277ae85af2b17ad96d4fd0c1148c393ff0..92cd5b0cbcd1ab56300158d26850969870e86f2b 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
index 5ff0dab23e516ae8114b8264492fb2a9d5c0b3f8..ffd4026dbade2f8f7eace399c52ae0428f3e8d7b 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
index 39b02ecf6ddb40737c4c1737d652c1a1b744d923..f11fa6c39c35efc14f8600dd746ab64cc940cd71 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import math
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
index 313e485d1e3080f2c59c68256cbc5c81aa6558cd..1561490087330c9af3ea3e384bf735eaa268a749 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import sys
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
index c4fc8b74cf80c3596b0af9f7f0434864591195bd..3e00e7d95f63ea652ea1964eb792f9393ffa5994 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py
index 3126293f9d8e52daa866be5fc1533648a33f3363..b46e4bfb86bd5dc9c74375693328f2506281be3e 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import paddle.fluid.core as core
@@ -124,6 +126,7 @@ class TestSGDOpOptimizeSelectedRows(unittest.TestCase):
         w_selected_rows = scope.var('Param').get_selected_rows()
         w_selected_rows.set_height(len(param_rows))
         w_selected_rows.set_rows(param_rows)
+        w_selected_rows.sync_index()
         w_array = np.ones((len(param_rows), row_width)).astype("float32")
         for i in range(len(param_rows)):
             w_array[i] *= i
diff --git a/python/paddle/fluid/tests/unittests/test_shape_op.py b/python/paddle/fluid/tests/unittests/test_shape_op.py
index a62ee050075cb8c9f8817c142825a89c24bdfedf..02231ea943e1e92a08730e6e9f1aa3cefeb927c0 100644
--- a/python/paddle/fluid/tests/unittests/test_shape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_shape_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
index a994bf181a74ca71a970da0105fe767f82750a6c..97f79f9421d498723da4c7992551f1210d3f6003 100644
--- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
+++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid.core as core
 from paddle.fluid.executor import Executor
diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
index c435796569cd2479c19d70a849991f439bf5292a..97ff203499c0bf223930c904de46e1abdd902799 100644
--- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 from op_test import OpTest
 from scipy.special import logit
diff --git a/python/paddle/fluid/tests/unittests/test_sign_op.py b/python/paddle/fluid/tests/unittests/test_sign_op.py
index 087a0c575bfa6bc18cb229ad274b4e1e90210605..85a9d9cae47c2b0942da0e0d962d4512af1566c0 100644
--- a/python/paddle/fluid/tests/unittests/test_sign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sign_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py
index 1a48bce3bb7c74551a365fd471f6869b128babac..134df38eea6655857db04dfdc19dd7f7897946f4 100644
--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_slice_var.py b/python/paddle/fluid/tests/unittests/test_slice_var.py
index 82305b23a1a1e2cee8cef6b291d848581fe5b509..fab63b7d5631829feffd26fc1dce2bd338d2036b 100644
--- a/python/paddle/fluid/tests/unittests/test_slice_var.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_var.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import math
 import unittest
 from paddle.fluid.transpiler.distribute_transpiler import slice_variable
diff --git a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py
index e74664dac4d87c5a7e1e061294e93e8267e3cc17..8ab6833821c75262124b3ae4200a17e457b718d5 100644
--- a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
index 70ad05597c4a160cf6a25aeb3c379320cef69c63..d88aa1ae1c9d848eba7a2224d22b5201fc27b857 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
index c0d9fc8f22a7c4f791d80a9cad87d003b5d54299..b7e5ff6d52ad7dde3dd94b3bd660cfca383e1ada 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 
diff --git a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
index ea1146166d34a31efbd859318b411cea50895fe1..5397d5c52158ccfb9ad5703b957ca59d6fa11418 100644
--- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid.core as core
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
index ca7861309839d183e18c168403881a0b1b5bf309..4c3d0258980fd8595704a65219deb520b96e222e 100644
--- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
@@ -12,8 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
+import six
 from op_test import OpTest
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
@@ -59,7 +62,7 @@ class TestSpliteIds(unittest.TestCase):
         x_tensor = x.get_tensor()
         x_tensor.set(np_array, place)
 
-        outs_name = ["out%d" % i for i in xrange(3)]
+        outs_name = ["out%d" % i for i in six.moves.xrange(3)]
         outs = [
             scope.var(var_name).get_selected_rows() for var_name in outs_name
         ]
diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py
index 6b67a52e81b978ed78c72629f9177759f8e2c4e1..3c5dd782f85235c4a2feb5a8ca6d048a012c5e1c 100644
--- a/python/paddle/fluid/tests/unittests/test_split_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
index 2b261820e04b08234477fc0a9adde95262f99bba..41a5ee59ea523b1f6c5015974a12c526e883fa35 100644
--- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid.core as core
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/test_spp_op.py b/python/paddle/fluid/tests/unittests/test_spp_op.py
index 3cbfc2a703f1c4a24674d468cd1152bfa6eb8ad2..a6c2cccd39c9cecb2ae904a1930b44ba18dbbd7e 100644
--- a/python/paddle/fluid/tests/unittests/test_spp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_spp_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py b/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py
index 78bc300ebec1cd34e44343d47376fef05a6d0135..a8bc1004d9bbe91e323db49c0cf0b576f8da306e 100644
--- a/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py b/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py
index 609445d52287f421e67a5796f9e50c1fb42c8e49..439bae9510ee84b131050bb6804a3ede2ad6a8b3 100644
--- a/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 import unittest
 from numpy import linalg as LA
diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
index bca6af2fd5dfadbc48cf1a76cfa6ffd4f1fdfdef..2be8e24a0fae6945351eb767ac924d7ca70848ab 100644
--- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 
@@ -39,7 +41,7 @@ class TestSqueezeOp(OpTest):
         self.new_shape = (3, 5)
 
     def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": False}
+        self.attrs = {"axes": self.axes}
 
 
 # Correct: There is mins axis.
@@ -66,49 +68,5 @@ class TestSqueezeOp3(TestSqueezeOp):
         self.new_shape = (3, 5, 1, 4)
 
 
-# Correct: Inplace.
-class TestSqueezeOpInplace1(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = (0, 2)
-        self.new_shape = (3, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inplace. There is mins axis.
-class TestSqueezeOpInplace2(TestSqueezeOp):
-    def inti_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = (0, -2)
-        self.new_shape = (3, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inplace. No axes input.
-class TestSqueezeOpInplace3(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = ()
-        self.new_shape = (3, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inpalce. Just part of axes be squeezed. 
-class TestSqueezeOpInplace4(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 1, 5, 1, 4, 1)
-        self.axes = (1, -1)
-        self.new_shape = (3, 5, 1, 4)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py
index 7956897d68a3fb49d62ba696d0b6400b4f909989..55820f31b81df9f3618d1004f6d21565564efa29 100644
--- a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 from test_sum_op import TestSumOp
diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py
index 1d90414e137a70e6265042e24e106fe565802778..74797bb65678404b7b35d06eecc7f9a12b2a346e 100644
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
@@ -12,9 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
 
 
 class TestSumOp(OpTest):
@@ -40,5 +44,66 @@ class TestSumOp(OpTest):
         pass
 
 
+class TestSelectedRowsSumOp(OpTest):
+    def check_with_place(self, place):
+        scope = core.Scope()
+        self.check_input_and_optput(scope, place, True, True, True)
+        self.check_input_and_optput(scope, place, False, True, True)
+        self.check_input_and_optput(scope, place, False, False, True)
+        self.check_input_and_optput(scope, place, False, False, False)
+
+    def check_input_and_optput(self,
+                               scope,
+                               place,
+                               w1_has_data=False,
+                               w2_has_data=False,
+                               w3_has_data=False):
+
+        self.create_selected_rows(scope, place, "W1", w1_has_data)
+        self.create_selected_rows(scope, place, "W2", w2_has_data)
+        self.create_selected_rows(scope, place, "W3", w3_has_data)
+
+        # create Out Variable
+        out = scope.var('Out').get_selected_rows()
+
+        # create and run sum operator
+        sum_op = Operator("sum", X=["W1", "W2", "W3"], Out='Out')
+        sum_op.run(scope, place)
+
+        has_data_w_num = 0
+        for w in [w1_has_data, w2_has_data, w3_has_data]:
+            if not w:
+                has_data_w_num += 1
+
+        self.assertEqual(7 * has_data_w_num, len(out.rows()))
+
+    def create_selected_rows(self, scope, place, var_name, isEmpty):
+        # create and initialize W Variable
+        if not isEmpty:
+            rows = [0, 1, 2, 3, 4, 5, 6]
+            row_numel = 12
+        else:
+            rows = []
+            row_numel = 12
+
+        var = scope.var(var_name)
+        w_selected_rows = var.get_selected_rows()
+        w_selected_rows.set_height(len(rows))
+        w_selected_rows.set_rows(rows)
+        w_array = np.ones((len(rows), row_numel)).astype("float32")
+        for i in range(len(rows)):
+            w_array[i] *= i
+        w_tensor = w_selected_rows.get_tensor()
+        w_tensor.set(w_array, place)
+
+        return var
+
+    def test_w_is_selected_rows(self):
+        places = [core.CPUPlace()]
+        # currently only support CPU
+        for place in places:
+            self.check_with_place(place)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_switch.py b/python/paddle/fluid/tests/unittests/test_switch.py
index 528c5cce4bc7262ade196f6a81a57a57089117ec..2a9c07a889ba5fe24fd1c098729a233cb8fbb16f 100644
--- a/python/paddle/fluid/tests/unittests/test_switch.py
+++ b/python/paddle/fluid/tests/unittests/test_switch.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
index bd208897520122b6a5dcf71da325b1b9dba632f6..aec219f80639415a9be55ba18e7940953d0e11b0 100644
--- a/python/paddle/fluid/tests/unittests/test_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import random
diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py
index 5ccc876ae8e6e20f76c77c1892f4de59d72bffc8..e9d0f8a0193c77da33a8cf128dbf8a1c5087782b 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import paddle.fluid.core as core
 import unittest
 import numpy
diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py
index cbc3da550306b9febe8a8fd22e7f71efa572a3d0..e54e170f7f1e03db4b63db72edb7395d18130f68 100644
--- a/python/paddle/fluid/tests/unittests/test_top_k_op.py
+++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py
index ebd63fbd495354eafe298ad5cc3456a196538a6a..0853f80b82030679d140f7fabdd42557c2374599 100644
--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py
index e033e86114f17d37c01480fe8350648eb8aa27cb..7b8be24d9da8c15eeb52c0ba207ea780b03254f8 100644
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
index 346a949b6e7c96b5535f5e65ddbada11e110a0a7..d6a5d68765c53d9d711add64c86575a0db6997e4 100644
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_unique_name.py b/python/paddle/fluid/tests/unittests/test_unique_name.py
index 49ef335618ca7ca1e8249a61a97ca552dabdb9e8..b8c751b2e9b5a905d9de40fc5f78a02c6ca5e034 100644
--- a/python/paddle/fluid/tests/unittests/test_unique_name.py
+++ b/python/paddle/fluid/tests/unittests/test_unique_name.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid as fluid
 
diff --git a/python/paddle/fluid/tests/unittests/test_unpool_op.py b/python/paddle/fluid/tests/unittests/test_unpool_op.py
index ecce4cdde2d648fe7d65427e34c77f5f9ad61417..b0c7c3c8662e217f4e88245f22f6b50e7a48c8b7 100644
--- a/python/paddle/fluid/tests/unittests/test_unpool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unpool_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 from op_test import OpTest
@@ -27,7 +29,7 @@ def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings):
             for h in range(s2):
                 for w in range(s3):
                     index = indices[nidx, cidx, h, w]
-                    hidx = (index - index % out_wsize) / out_wsize
+                    hidx = (index - index % out_wsize) // out_wsize
                     widx = index % out_wsize
                     out[nidx, cidx, int(hidx), int(widx)] = \
                             input[nidx, cidx, h, w]
@@ -41,9 +43,9 @@ class TestUnpoolOp(OpTest):
         self.init_test_case()
         pre_input = np.random.random(self.shape).astype("float32")
         nsize, csize, hsize, wsize = pre_input.shape
-        hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) / \
+        hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) // \
                 self.strides[0] + 1
-        wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) / \
+        wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) // \
                 self.strides[1] + 1
         input = np.zeros((nsize, csize, hsize_out, wsize_out))
         indices = np.zeros((nsize, csize, hsize_out, wsize_out))
@@ -62,7 +64,7 @@ class TestUnpoolOp(OpTest):
                         input[nidx, cidx, i, j] = x_masked.max()
                         arg = x_masked.argmax()
                         indices[nidx, cidx, i, j] = \
-                                (r_start + arg / self.ksize[1]) * wsize + \
+                                (r_start + arg // self.ksize[1]) * wsize + \
                                 c_start + arg % self.ksize[1]
         output = self.unpool2d_forward_naive(input, indices, self.ksize, \
                 self.strides, self.paddings).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
index 7a4aa0a40b5eb494f6027e800ca6b466bbe1c302..a324438ba5a3c3b57fd956bd11189ef7d50267e2 100644
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 
@@ -39,7 +41,7 @@ class TestUnsqueezeOp(OpTest):
         self.new_shape = (3, 1, 1, 5)
 
     def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": False}
+        self.attrs = {"axes": self.axes}
 
 
 # Correct: Single input index.
@@ -74,38 +76,5 @@ class TestUnsqueezeOp4(TestUnsqueezeOp):
         self.new_shape = (3, 1, 1, 2, 5, 1)
 
 
-# Correct: Inplace.
-class TestUnsqueezeOpInplace1(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 5)
-        self.axes = (0, 2)
-        self.new_shape = (1, 3, 1, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inplace. There is mins index.
-class TestUnsqueezeOpInplace2(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 5)
-        self.axes = (0, -2)
-        self.new_shape = (1, 3, 1, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inplace. There is duplicated axis.
-class TestUnsqueezeOpInplace3(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 2, 5)
-        self.axes = (0, 3, 3)
-        self.new_shape = (1, 3, 2, 1, 1, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py
index 49784e21c461bacadd404bf4a8640ebc4dcb26ca..b0830e130dd9a9037f8dd900a256eea3d05f64b8 100644
--- a/python/paddle/fluid/tests/unittests/test_variable.py
+++ b/python/paddle/fluid/tests/unittests/test_variable.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 from paddle.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/test_version.py b/python/paddle/fluid/tests/unittests/test_version.py
index a09c8a759b9461edcf7d5ddbd62d74408d5f292e..42a0e5c802c53ed0e6aad38fb9ab0f64122e87f5 100644
--- a/python/paddle/fluid/tests/unittests/test_version.py
+++ b/python/paddle/fluid/tests/unittests/test_version.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import re
 
diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
index 9f1aaee472f918da7deb8816a0a4654dafe74a30..5e3aa13546d0c4fdcde4a3d6378d5a1748327814 100644
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import sys
 import unittest
 import numpy as np
@@ -132,7 +134,7 @@ class CTCForward(object):
             for k in range(end - start):
                 j = k + start
                 if j & 1 == 1:
-                    label_idx = j / 2
+                    label_idx = j // 2
                     label_val = labels_a_sequence[label_idx, 0]
                     fv = self.log_add(forward_vars[i - 1, j],
                                       forward_vars[i - 1, j - 1])
diff --git a/python/paddle/fluid/tests/unittests/test_weight_normalization.py b/python/paddle/fluid/tests/unittests/test_weight_normalization.py
index 436f9b9f86fb86270e47c8e30c5c0701787ca0f1..e990d8b2498f6a1b62f7a34d329e3ca72a962728 100644
--- a/python/paddle/fluid/tests/unittests/test_weight_normalization.py
+++ b/python/paddle/fluid/tests/unittests/test_weight_normalization.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy
 import collections
diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py
index 790e6afe5f02236b00d9c67b7b25a881e07abace..b75373cf24a7344bf59b3c6fcb9c4c3969be6503 100644
--- a/python/paddle/fluid/tests/unittests/test_while_op.py
+++ b/python/paddle/fluid/tests/unittests/test_while_op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import paddle.fluid.layers as layers
 from paddle.fluid.executor import Executor
diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py
index c6e176ca31c57e623addd9594be81c0abdce489b..31ae25f02c6fab0d68e83a131ace054d4eb56c12 100644
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import numpy as np
 
 import paddle.fluid.core as core
diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py
index 868a0248be6833d0e8fed8a26549352562c279c1..f0e74aff6bdfa7d9f0a7f10e64cac4de88009f0a 100644
--- a/python/paddle/fluid/tests/unittests/transformer_model.py
+++ b/python/paddle/fluid/tests/unittests/transformer_model.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from functools import partial
 import numpy as np
 
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index eed9b49ef40b591d5b6481846dab714423f57990..d094647afe1900809fc32cae93f777765f72c675 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -12,10 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import contextlib
 import os
 import errno
 import shutil
+import six
 import time
 
 from . import core
@@ -282,11 +285,12 @@ class Trainer(object):
             self._load_checkpoint()
 
         if param_path and os.path.isdir(param_path):
-            # load params from param_path into scope
-            io.load_persistables(
-                executor=exe,
-                dirname=param_path,
-                main_program=self.startup_program)
+            with self._prog_and_scope_guard():
+                # load params from param_path into scope
+                io.load_persistables(
+                    executor=exe,
+                    dirname=param_path,
+                    main_program=self.startup_program)
 
     def _transpile_nccl2_dist(self):
         # PADDLE_TRAINER_IPS
@@ -618,7 +622,7 @@ def build_feed_var_list(program, feed_order):
                 "The values of 'feed_order' should be a permutation of [0, len(feed_order))"
             )
         sorted_pair_list = sorted(
-            list(feed_order.items()), key=lambda item: item[1])
+            six.iteritems(feed_order), key=lambda item: item[1])
         feed_var_list = [
             program.global_block().var(pair[0]) for pair in sorted_pair_list
         ]
@@ -1036,7 +1040,7 @@ def _save_trainer_args(dirname, trainer_id, trainer_args):
 
     cur_dir = _get_trainer_dir(dirname, trainer_id)
 
-    for name, value in list(trainer_args.items()):
+    for name, value in six.iteritems(trainer_args):
         args_file = os.path.join(cur_dir, name)
         with open(args_file, 'w') as f:
             f.write(str(value))
diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py
index a8622ad54433fff40f68520955f0294e2955577e..8429e2fd7c5141f064c66d8f406889bca1510fe2 100644
--- a/python/paddle/fluid/transpiler/__init__.py
+++ b/python/paddle/fluid/transpiler/__init__.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from .distribute_transpiler import DistributeTranspiler, DistributeTranspilerConfig
 from .inference_transpiler import InferenceTranspiler
 from .memory_optimization_transpiler import memory_optimize, release_memory
diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py
index 1bfab1f219f8a2f08a0fb5c0042d87a3ad707dd5..5e98266a761c7e01bd6668e85e6adeb54103ca80 100644
--- a/python/paddle/fluid/transpiler/details/__init__.py
+++ b/python/paddle/fluid/transpiler/details/__init__.py
@@ -12,5 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from .program_utils import *
 from .ufind import *
diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py
index 76d10777f5f9ed6d27d55a640108bd036d8d8bac..640dbf4bbed58edf746456419af18c75241fa03c 100644
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
@@ -12,12 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
+import six
+
 
 def delete_ops(block, ops):
     try:
         start = list(block.ops).index(ops[0])
         end = list(block.ops).index(ops[-1])
-        [block._remove_op(start) for _ in range(end - start + 1)]
+        [block._remove_op(start) for _ in six.moves.range(end - start + 1)]
     except Exception as e:
         raise e
     block.program._sync_with_cpp()
diff --git a/python/paddle/fluid/transpiler/details/ufind.py b/python/paddle/fluid/transpiler/details/ufind.py
index 0e30d0e3f9c5712c494daf17b2b4bcec86f69c23..aa63af7dcf7ac85031fb00ca4c39fb36d7e588b8 100644
--- a/python/paddle/fluid/transpiler/details/ufind.py
+++ b/python/paddle/fluid/transpiler/details/ufind.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 
 class UnionFind(object):
     """ Union-find data structure.
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index ce4709f23b752cc061f3b767a262f82378b86707..540eb8c8339981dd727a001c048358895e7b951e 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
 """
 Steps to transpile trainer:
 1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
@@ -31,6 +33,8 @@ Steps to transpile pserver:
 import math
 import random
 import numpy as np
+import collections
+import six
 
 from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
 from .. import core, framework
@@ -207,7 +211,17 @@ class DistributeTranspiler(object):
 
         ps_dispatcher = self.config.split_method(self.pserver_endpoints)
         self.has_distributed_lookup_table = self._has_distributed_lookup_table()
+        self.param_name_to_grad_name = dict()
+        for param_var, grad_var in self.params_grads:
+            self.param_name_to_grad_name[param_var.name] = grad_var.name
+
+        # add distributed attrs to program
+        self.origin_program._is_distributed = True
+        self.origin_program._endpoints = self.pserver_endpoints
+        self.origin_program._is_chief = self.trainer_id == 0
+        self.origin_program._distributed_lookup_table = self.table_name if self.table_name else None
 
+        # split and create vars, then put splited vars in dicts for later use.
         # step 1: split and create vars, then put splited vars in dicts for later use.
         self._init_splited_vars()
 
@@ -220,39 +234,45 @@ class DistributeTranspiler(object):
         #       fc_w@GRAD_trainer_0, fc_w@GRAD_trainer_1 --> pserver1
         #       fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2
         # shuffle the map will avoid the uneven distribution above
-        grad_var_mapping_items = list(self.grad_var_mapping.items())
+        grad_var_mapping_items = list(six.iteritems(self.grad_var_mapping))
+
         if not self.config.slice_var_up:
-            random.seed(self.trainer_num)
+            random.seed(self.origin_program.random_seed)
             random.shuffle(grad_var_mapping_items)
 
-        for orig_varname, splited_vars in grad_var_mapping_items:
+        grad_name_to_send_dummy_out = dict()
+        for grad_varname, splited_vars in grad_var_mapping_items:
             eplist = ps_dispatcher.dispatch(splited_vars)
 
             if not self.config.slice_var_up:
                 assert (len(splited_vars) == 1)
 
+            splited_grad_varname = grad_varname
             if len(splited_vars) == 1:
-                orig_varname = splited_vars[0].name
+                splited_grad_varname = splited_vars[0].name
                 index = find_op_by_output_arg(program.global_block(),
-                                              orig_varname)
+                                              splited_grad_varname)
             elif len(splited_vars) > 1:
-                orig_var = program.global_block().vars[orig_varname]
+                orig_var = program.global_block().vars[splited_grad_varname]
                 index = find_op_by_output_arg(program.global_block(),
-                                              orig_varname)
+                                              splited_grad_varname)
                 self._insert_split_op(program, orig_var, index, splited_vars)
                 index += 1
             else:
                 AssertionError("Can not insert the send op by original "
-                               "variable name :", orig_varname)
+                               "variable name :", splited_grad_varname)
 
+            dummy_output = program.global_block().create_var()
+            grad_name_to_send_dummy_out[grad_varname] = dummy_output
             program.global_block()._insert_op(
                 index=index + 1,
                 type="send",
                 inputs={"X": splited_vars},
-                outputs={},
+                outputs={"Out": dummy_output},
                 attrs={
                     "epmap": eplist,
-                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
+                    "sync_mode": not self.sync_mode,
                 })
             for _, var in enumerate(splited_vars):
                 send_vars.append(var)
@@ -264,7 +284,6 @@ class DistributeTranspiler(object):
                 outputs={},
                 attrs={
                     "endpoints": pserver_endpoints,
-                    "sync_mode": self.sync_mode,
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                 })
 
@@ -280,19 +299,21 @@ class DistributeTranspiler(object):
             self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i])
 
         # step4: Concat the parameters splits together after recv.
-        for varname, splited_var in list(self.param_var_mapping.items()):
+        for param_varname, splited_var in six.iteritems(self.param_var_mapping):
             eps = []
             for var in splited_var:
                 index = [v.name for v in recv_vars].index(var.name)
                 eps.append(eplist[index])
-
+            grad_send_dummy_out = grad_name_to_send_dummy_out[
+                self.param_name_to_grad_name[param_varname]]
             program.global_block().append_op(
                 type="recv",
-                inputs={},
+                inputs={"X": [grad_send_dummy_out]},
                 outputs={"Out": splited_var},
                 attrs={
                     "epmap": eps,
-                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
+                    "sync_mode": not self.sync_mode
                 })
 
         if self.sync_mode:
@@ -305,10 +326,10 @@ class DistributeTranspiler(object):
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                 })
 
-        for varname, splited_var in list(self.param_var_mapping.items()):
+        for param_varname, splited_var in six.iteritems(self.param_var_mapping):
             if len(splited_var) <= 1:
                 continue
-            orig_param = program.global_block().vars[varname]
+            orig_param = program.global_block().vars[param_varname]
             program.global_block().append_op(
                 type="concat",
                 inputs={"X": splited_var},
@@ -355,7 +376,7 @@ class DistributeTranspiler(object):
         # FIXME(gongwb): delete not need ops.
         # note that: some parameter is not trainable and those ops can't be deleted.
 
-        for varname, splited_var in self.param_var_mapping.iteritems():
+        for varname, splited_var in six.iteritems(self.param_var_mapping):
             # Get the eplist of recv vars
             eps = []
             for var in splited_var:
@@ -376,7 +397,7 @@ class DistributeTranspiler(object):
 
             op = startup_program.global_block().append_op(
                 type="recv",
-                inputs={},
+                inputs={"X": []},
                 outputs={"Out": splited_var},
                 attrs={
                     "epmap": eps,
@@ -392,7 +413,7 @@ class DistributeTranspiler(object):
                 RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
             })
 
-        for varname, splited_var in self.param_var_mapping.iteritems():
+        for varname, splited_var in six.iteritems(self.param_var_mapping):
             #add concat ops to merge splited parameters received from parameter servers.
             if len(splited_var) <= 1:
                 continue
@@ -576,6 +597,8 @@ class DistributeTranspiler(object):
             checkpoint_block_id = self._create_checkpoint_save_block(
                 pserver_program, table_opt_block.idx)
 
+            pserver_program._distributed_lookup_table = self.table_name
+
         # NOTE: if has_distributed_lookup_table is False, then prefetch_block will
         # not be executed, so it's safe to use optimize_block to hold the place
         if self.has_distributed_lookup_table:
@@ -602,6 +625,10 @@ class DistributeTranspiler(object):
             outputs={},
             attrs=attrs)
 
+        # add distributed attrs
+        pserver_program._slice_vars_and_attrs = self._get_slice_vars_and_attrs(
+            endpoint)
+
         pserver_program._sync_with_cpp()
         return pserver_program
 
@@ -641,14 +668,14 @@ class DistributeTranspiler(object):
 
         # 1. create vars in pserver program to startup program
         pserver_vars = pserver_program.global_block().vars
-        created_var_map = dict()
-        for _, var in list(pserver_vars.items()):
+        created_var_map = collections.OrderedDict()
+        for _, var in six.iteritems(pserver_vars):
             tmpvar = s_prog.global_block()._clone_variable(var)
             created_var_map[var.name] = tmpvar
 
         # 2. rename op outputs
         for op in orig_s_prog.global_block().ops:
-            new_outputs = dict()
+            new_outputs = collections.OrderedDict()
             # do not append startup op if var is not on this pserver
             op_on_pserver = False
             # TODO(gongwb): remove this line.
@@ -675,8 +702,31 @@ class DistributeTranspiler(object):
                     inputs=new_inputs,
                     outputs=new_outputs,
                     attrs=op.all_attrs())
+
+        # add slice vars
+        s_prog._slice_vars_and_attrs = self._get_slice_vars_and_attrs(endpoint)
+
         return s_prog
 
+    def _get_slice_vars_and_attrs(self, endpoint):
+        slice_vars_and_attrs = []
+        block_suffix = "block"
+        for param in self.param_grad_ep_mapping[endpoint]["params"]:
+            orig_var_name, block_name, _ = self._get_varname_parts(param.name)
+            if not block_name:
+                continue
+
+            block_idx = int(block_name.split(block_suffix)[1])
+            orig_var = self.origin_program.global_block().vars[orig_var_name]
+
+            skip_numel = 0
+            slice_vars = self.param_var_mapping[orig_var_name]
+            for slice_var in slice_vars[:block_idx]:
+                skip_numel += reduce(lambda x, y: x * y, slice_var.shape)
+            slice_vars_and_attrs.append([orig_var, skip_numel, param])
+
+        return slice_vars_and_attrs
+
     # ====================== private transpiler functions =====================
 
     def _has_distributed_lookup_table(self):
@@ -782,22 +832,24 @@ class DistributeTranspiler(object):
                                           self.config.min_block_size)
         assert (len(grad_blocks) == len(param_blocks))
 
-        # origin_varname -> [splited_var]
+        # origin_param_name -> [splited_param_vars]
         self.param_var_mapping = self._create_vars_from_blocklist(
             self.origin_program, param_blocks)
+        # origin_grad_name -> [splited_grad_vars]
         self.grad_var_mapping = self._create_vars_from_blocklist(
             self.origin_program,
             grad_blocks,
             add_trainer_suffix=self.trainer_num > 1)
-        self.grad_param_mapping = dict()
+        # dict(grad_splited_var -> param_splited_var)
+        self.grad_param_mapping = collections.OrderedDict()
         for g, p in zip(grad_blocks, param_blocks):
             g_name, g_bid, _ = g.split(":")
             p_name, p_bid, _ = p.split(":")
             self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] =  \
-                    self.param_var_mapping[p_name][int(p_bid)]
+                self.param_var_mapping[p_name][int(p_bid)]
 
         # create mapping of endpoint -> split var to create pserver side program
-        self.param_grad_ep_mapping = dict()
+        self.param_grad_ep_mapping = collections.OrderedDict()
         [
             self.param_grad_ep_mapping.update({
                 ep: {
@@ -915,7 +967,7 @@ class DistributeTranspiler(object):
                     index=op_index + 2,
                     type="send",
                     inputs={'X': self.trainer_side_table_grad_list},
-                    outputs={},
+                    outputs={'Out': []},
                     attrs={
                         "sync_mode": True,
                         "epmap": pserver_endpoints,
@@ -1072,21 +1124,21 @@ class DistributeTranspiler(object):
             block_list (list[(varname, block_id, block_size)]): List of gradient blocks.
             add_trainer_suffix (Bool): Add trainer suffix to new variable's name if set True.
         Returns:
-            var_mapping (dict(varname->[new_varname_variable])):A dict mapping
+            var_mapping (collections.OrderedDict(varname->[new_varname_variable])):A dict mapping
                 from original var name to each var split.
         """
 
         # varname->[(block_id, current_block_size)]
-        block_map = dict()
+        block_map = collections.OrderedDict()
 
-        var_mapping = dict()
+        var_mapping = collections.OrderedDict()
         for block_str in block_list:
             varname, offset, size = block_str.split(":")
             if varname not in block_map:
                 block_map[varname] = []
             block_map[varname].append((int(offset), int(size)))
 
-        for varname, splited in list(block_map.items()):
+        for varname, splited in six.iteritems(block_map):
             orig_var = program.global_block().var(varname)
             if len(splited) == 1:
                 if self.sync_mode and add_trainer_suffix:
@@ -1107,7 +1159,7 @@ class DistributeTranspiler(object):
 
             for i, block in enumerate(splited):
                 size = block[1]
-                rows = size / orig_dim1_flatten
+                rows = size // orig_dim1_flatten
                 splited_shape = [rows]
                 if len(orig_shape) >= 2:
                     splited_shape.extend(orig_shape[1:])
@@ -1193,8 +1245,8 @@ class DistributeTranspiler(object):
         elif op_type == "momentum":
             if varkey == "Velocity":
                 return param_shape
-        elif op_type == "":
-            if varkey == "Moment":
+        elif op_type == "rmsprop":
+            if varkey in ["Moment", "MeanSquare"]:
                 return param_shape
         elif op_type == "sgd":
             pass
@@ -1271,10 +1323,8 @@ class DistributeTranspiler(object):
                             grad_to_block_id, origin_program, merged_var):
         program = optimize_block.program
         pserver_block = program.global_block()
-        new_inputs = dict()
+        new_inputs = collections.OrderedDict()
 
-        # update param/grad shape first, then other inputs like
-        # moment can use the updated shape
         def _get_param_block(opt_op):
             # param is already created on global program
             param_block = None
@@ -1287,22 +1337,6 @@ class DistributeTranspiler(object):
         for key in opt_op.input_names:
             if key == "Grad":
                 new_inputs[key] = merged_var
-            # For RMSProp optimizer
-            elif key == "Moment" or key == "MeanSquare":
-                param_block = _get_param_block(opt_op)
-                if not param_block:
-                    return
-                moment_var = origin_program.global_block().vars[opt_op.input(
-                    key)[0]]
-                tmpvar = pserver_block.create_var(
-                    name=moment_var.name,
-                    persistable=moment_var.persistable,
-                    dtype=moment_var.dtype,
-                    # change to use same shape as param
-                    # TODO(typhoonzero): didn't append .block in the var name,
-                    # may affect checkpoint saving? Need to verify.
-                    shape=param_block.shape)
-                new_inputs[key] = tmpvar
             elif key == "Param":
                 param_block = _get_param_block(opt_op)
                 if not param_block:
@@ -1330,7 +1364,7 @@ class DistributeTranspiler(object):
 
         for key in opt_op.input_names:
             new_shape = None
-            if key in ["Param", "Grad", "LearningRate", "Moment", "MeanSquare"]:
+            if key in ["Param", "Grad", "LearningRate"]:
                 continue
             var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
             # update accumulator variable shape
@@ -1357,9 +1391,7 @@ class DistributeTranspiler(object):
 
     def _is_splited_grad_var(self, var, var_dict):
         grad_block = None
-        # TODO(minqiyang): replace these items() with six.iteritems() to
-        # improve memory
-        for _, g in list(var_dict.items()):
+        for _, g in six.iteritems(var_dict):
             if self._orig_varname(g.name) == self._orig_varname(var.name):
                 if g.name.find(".trainer_") == -1:
                     grad_block = g
@@ -1369,7 +1401,7 @@ class DistributeTranspiler(object):
     def _clone_lr_op(self, program, block, op):
         inputs = self._get_input_map_from_op(
             self.origin_program.global_block().vars, op)
-        for key, varlist in list(inputs.items()):
+        for key, varlist in six.iteritems(inputs):
             if not isinstance(varlist, list):
                 varlist = [varlist]
             for var in varlist:
@@ -1378,7 +1410,7 @@ class DistributeTranspiler(object):
 
         outputs = self._get_output_map_from_op(
             self.origin_program.global_block().vars, op)
-        for key, varlist in list(outputs.items()):
+        for key, varlist in six.iteritems(outputs):
             if not isinstance(varlist, list):
                 varlist = [varlist]
             for var in varlist:
@@ -1393,7 +1425,7 @@ class DistributeTranspiler(object):
         # Append the ops for parameters that do not need to be optimized/updated
         inputs = self._get_input_map_from_op(
             self.origin_program.global_block().vars, opt_op)
-        for key, varlist in list(inputs.items()):
+        for key, varlist in six.iteritems(inputs):
             if not isinstance(varlist, list):
                 varlist = [varlist]
             for var in varlist:
@@ -1412,7 +1444,7 @@ class DistributeTranspiler(object):
 
         outputs = self._get_output_map_from_op(
             self.origin_program.global_block().vars, opt_op)
-        for key, varlist in list(outputs.items()):
+        for key, varlist in six.iteritems(outputs):
             if not isinstance(varlist, list):
                 varlist = [varlist]
             for var in varlist:
@@ -1470,7 +1502,7 @@ class DistributeTranspiler(object):
 
     def _get_input_map_from_op(self, varmap, op):
         """Returns a dict from op input name to the vars in varmap."""
-        iomap = dict()
+        iomap = collections.OrderedDict()
         for key in op.input_names:
             vars = []
             for varname in op.input(key):
@@ -1483,7 +1515,7 @@ class DistributeTranspiler(object):
 
     def _get_output_map_from_op(self, varmap, op):
         """Returns a dict from op output name to the vars in varmap."""
-        iomap = dict()
+        iomap = collections.OrderedDict()
         for key in op.output_names:
             vars = []
             for varname in op.output(key):
diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
index 87f20bbccf3138585841952efacef5b0a3cbbace..f79fcb24bb5a48de00d03fc468b6526e48656d07 100644
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import os
 import numpy as np
 from .. import core
@@ -57,8 +59,12 @@ class InferenceTranspiler(object):
             scope = global_scope()
         if not isinstance(scope, core.Scope):
             raise TypeError("scope should be as Scope type or None")
-        self._fuse_batch_norm(program, place, scope)
-        self._fuse_relu_mkldnn(program)
+        use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
+        if use_mkldnn:
+            self._fuse_relu_mkldnn(program)
+            self._fuse_conv_bias_mkldnn(program)
+        else:
+            self._fuse_batch_norm(program, place, scope)
 
     def _fuse_relu_mkldnn(self, program):
         '''
@@ -80,10 +86,6 @@ class InferenceTranspiler(object):
         :param program: program to transpile
         :type program: Program
         '''
-        use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
-        if not use_mkldnn:
-            return
-
         self.block = program.block(0)
 
         i = 0
@@ -104,6 +106,69 @@ class InferenceTranspiler(object):
         # And a better solution will be considered later.
         program = program.clone()
 
+    def _fuse_conv_bias_mkldnn(self, program):
+        '''
+        Transpile the program by fused convolution and elementwise_add.
+
+        Replace conv2d and elementwise_add ops with a new conv2d op
+        based on an old conv2d op and the :math:`Bias` taken from
+        elementwise_add.
+
+        For input :math:`X`:
+
+        - Conv process:            :math:`X = input * W`
+        - Elementwise_add process: :math` X = X + bias`
+
+        After fuse into one operation:
+
+        .. math::
+
+            X = input * W + bias
+
+        The operator transformation is:
+
+        - before:
+
+          - conv->elementwise_add->any_other_op
+
+        - after:
+
+          - conv->any_other_op
+
+        The transpile stages are:
+
+        1. Extract bias and output variables from elementwise_add.
+        2. Extract Input, Weight and attributes from conv op.
+        3. Create a new convolution op based on extracted params.
+        4. Remove old conv op.
+        5. Remove elementwise_add.
+        5. Remove unused variables.
+
+        Args:
+            program (Program): program to transpile
+
+        '''
+        self.block = program.block(0)
+
+        i = 0
+        while i < len(self.block.ops) - 2:
+            current_op = self.block.ops[i]
+            next_op = self.block.ops[i + 1]
+            # conv2d with bias
+            if current_op.type in ['conv2d'] and \
+               next_op.type in ['elementwise_add']:
+                self._fuse_conv_bias(i, current_op, next_op)
+                self.block._remove_op(i + 1)  # Remove old conv
+                self.block._remove_op(i + 1)  # Remove elementwise_add
+                i = i + 1
+            i = i + 1
+
+        self._remove_unused_var()
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
     def _fuse_batch_norm(self, program, place, scope):
         '''
         Transpile the program by fused batch normalization.
@@ -183,7 +248,6 @@ class InferenceTranspiler(object):
                         self.block._remove_op(i + 2)
                         i = i + 1
             i = i + 1
-
         self._adjust_input()
         self._remove_unused_var()
         # TODO(luotao): use clone() method to flush the program.desc in force,
@@ -286,6 +350,33 @@ class InferenceTranspiler(object):
         # collect the renamed input
         self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]
 
+    def _fuse_conv_bias(self, index, conv_op, elementwise_add_op):
+        '''
+        fuse the conv op with elementwise_add
+
+        :param index: index of the conv_op in ops list
+        :type index: Int
+        :param conv_op: convolution operator
+        :type conv_op: Operator
+        :param elementwise_add_op: convolution's bias operator
+        :type elementwise_add_op: Operator
+        '''
+
+        bias_var = self.block.var(elementwise_add_op.input("Y")[0])
+        out_var = self.block.var(elementwise_add_op.output("Out")[0])
+        filter_var = self.block.var(conv_op.input("Filter")[0])
+        in_var = self.block.var(conv_op.input("Input")[0])
+        attrs = {name: conv_op.attr(name) for name in conv_op.attr_names}
+
+        self.block._insert_op(
+            index,
+            type="conv2d",
+            inputs={"Input": in_var,
+                    "Filter": filter_var,
+                    "Bias": bias_var},
+            outputs={"Output": out_var},
+            attrs=attrs)
+
     def _adjust_input(self):
         for i in range(len(self.block.ops)):
             current_op = self.block.ops[i]
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 20ba7ed2b0b9df0d0432727ee1f69f61533c402e..3e58e125de4188144646236f7999c620cd8e9459 100644
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -12,8 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 from collections import defaultdict
 from .. import core
+from ... import compat as cpt
 from ..framework import Program, default_main_program, Parameter
 from ..backward import _rename_arg_
 from functools import reduce
@@ -125,15 +128,15 @@ class ControlFlowGraph(object):
 
     def _has_var(self, block_desc, var_name, is_forward):
         if is_forward:
-            return block_desc.has_var(str(var_name))
+            return block_desc.has_var(cpt.to_bytes(var_name))
         else:
-            return block_desc.has_var_recursive(str(var_name))
+            return block_desc.has_var_recursive(cpt.to_bytes(var_name))
 
     def _find_var(self, block_desc, var_name, is_forward):
         if is_forward:
-            return block_desc.find_var(str(var_name))
+            return block_desc.find_var(cpt.to_bytes(var_name))
         else:
-            return block_desc.find_var_recursive(str(var_name))
+            return block_desc.find_var_recursive(cpt.to_bytes(var_name))
 
     def _check_var_validity(self, block_desc, x, is_forward):
         if str(x) == "@EMPTY@":
@@ -258,7 +261,7 @@ class ControlFlowGraph(object):
                         # Rename the var to the cache var already with
                         # memory allocated in order to reuse the memory.
                         _rename_arg_(self._ops, x, cache_var, begin_idx=i)
-                        self._program.block(block_desc.id).var(str(
+                        self._program.block(block_desc.id).var(cpt.to_text(
                             x)).desc = self._find_var(block_desc, cache_var,
                                                       is_forward)
                         self._update_graph(x, cache_var, begin_idx=i)
diff --git a/python/paddle/fluid/transpiler/ps_dispatcher.py b/python/paddle/fluid/transpiler/ps_dispatcher.py
index dcffadd531719431f27feb464ed58a65c04770ee..6a6d14a69ba771e192a28951a6df7027741a655a 100644
--- a/python/paddle/fluid/transpiler/ps_dispatcher.py
+++ b/python/paddle/fluid/transpiler/ps_dispatcher.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 
 class PSDispatcher(object):
     """
diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py
index b125eba4f83c588fa2fa81a357604a7d8592ea80..b9957a699e597898bee75ce0e7283f7224293f0c 100644
--- a/python/paddle/fluid/unique_name.py
+++ b/python/paddle/fluid/unique_name.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import collections
 import contextlib
 import six
diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
index ce410e61b92e7d3f32fa5bfeb415e4b6c5fa9df6..6d7ac876fdf65fe0e85cceb94c311a93d9ea39c2 100644
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -27,6 +27,7 @@ from six.moves import zip
 import itertools
 import random
 import zlib
+import paddle.compat as cpt
 
 
 def map_readers(func, *readers):
@@ -390,9 +391,9 @@ class PipeReader:
             buff = self.process.stdout.read(self.bufsize)
             if buff:
                 if self.file_type == "gzip":
-                    decomp_buff = self.dec.decompress(buff)
+                    decomp_buff = cpt.to_text(self.dec.decompress(buff))
                 elif self.file_type == "plain":
-                    decomp_buff = buff
+                    decomp_buff = cpt.to_text(buff)
                 else:
                     raise TypeError("file_type %s is not allowed" %
                                     self.file_type)
diff --git a/python/paddle/reader/tests/creator_test.py b/python/paddle/reader/tests/creator_test.py
index c4238c12a74759d52eb09f31ce1126cc93dd3489..d7107610a5dd751cad8f8365aec32c6ba92c53ae 100644
--- a/python/paddle/reader/tests/creator_test.py
+++ b/python/paddle/reader/tests/creator_test.py
@@ -29,6 +29,7 @@ import os
 import unittest
 import numpy as np
 import paddle.reader.creator
+import six
 
 
 class TestNumpyArray(unittest.TestCase):
@@ -37,7 +38,7 @@ class TestNumpyArray(unittest.TestCase):
         x = np.array(l, np.int32)
         reader = paddle.reader.creator.np_array(x)
         for idx, e in enumerate(reader()):
-            self.assertItemsEqual(e, l[idx])
+            six.assertCountEqual(self, e, l[idx])
 
 
 class TestTextFile(unittest.TestCase):
diff --git a/python/paddle/utils/dump_config.py b/python/paddle/utils/dump_config.py
index d27af7f76246a4c9db9a43c17715506d82031b9c..6a96a0a78fc77c50904ee7822c725c41e646c5e6 100644
--- a/python/paddle/utils/dump_config.py
+++ b/python/paddle/utils/dump_config.py
@@ -37,9 +37,9 @@ if __name__ == '__main__':
     assert isinstance(conf, TrainerConfig_pb2.TrainerConfig)
 
     if whole_conf:
-        print conf
+        print(conf)
     else:
         if binary:
             sys.stdout.write(conf.model_config.SerializeToString())
         else:
-            print conf.model_config
+            print(conf.model_config)
diff --git a/python/paddle/utils/image_multiproc.py b/python/paddle/utils/image_multiproc.py
index 3e3e519f76d388eeb477f0014bcbb3e7cd09352a..d1bbda3fd3562efe486377d41a9fb7359bafa4e7 100644
--- a/python/paddle/utils/image_multiproc.py
+++ b/python/paddle/utils/image_multiproc.py
@@ -15,7 +15,8 @@
 import os, sys
 import numpy as np
 from PIL import Image
-from cStringIO import StringIO
+import six
+from six.moves import cStringIO as StringIO
 import multiprocessing
 import functools
 import itertools
@@ -187,7 +188,8 @@ class PILTransformer(ImageTransformer):
         return self.transform(im)
 
 
-def job(is_img_string, transformer, (data, label)):
+def job(is_img_string, transformer, data_label_pack):
+    (data, label) = data_label_pack
     if is_img_string:
         return transformer.transform_from_string(data), label
     else:
@@ -208,7 +210,7 @@ class MultiProcessImageTransformer(object):
         """
         Processing image with multi-process. If it is used in PyDataProvider,
         the simple usage for CNN is as follows:
-       
+
         .. code-block:: python
 
             def hool(settings, is_train,  **kwargs):
@@ -229,7 +231,7 @@ class MultiProcessImageTransformer(object):
             @provider(init_hook=hook, pool_size=20480)
             def process(settings, file_list):
                 with open(file_list, 'r') as fdata:
-                    for line in fdata: 
+                    for line in fdata:
                         data_dic = np.load(line.strip()) # load the data batch pickled by Pickle.
                         data = data_dic['data']
                         labels = data_dic['label']
@@ -249,10 +251,10 @@ class MultiProcessImageTransformer(object):
         :type channel_swap: tuple or list
         :param mean: the mean values of image, per-channel mean or element-wise mean.
         :type mean: array, The dimension is 1 for per-channel mean.
-                    The dimension is 3 for element-wise mean. 
+                    The dimension is 3 for element-wise mean.
         :param is_train: training peroid or testing peroid.
         :type is_train: bool.
-        :param is_color: the image is color or gray. 
+        :param is_color: the image is color or gray.
         :type is_color: bool.
         :param is_img_string: The input can be the file name of image or image string.
         :type is_img_string: bool.
@@ -273,4 +275,4 @@ class MultiProcessImageTransformer(object):
     def run(self, data, label):
         fun = functools.partial(job, self.is_img_string, self.transformer)
         return self.pool.imap_unordered(
-            fun, itertools.izip(data, label), chunksize=100 * self.procnum)
+            fun, six.moves.zip(data, label), chunksize=100 * self.procnum)
diff --git a/python/paddle/utils/image_util.py b/python/paddle/utils/image_util.py
index d3d79b14405256bbc95c41d805dbee56cb104f5e..a8092349cde8a4cb30873bf819fd5ed96289a945 100644
--- a/python/paddle/utils/image_util.py
+++ b/python/paddle/utils/image_util.py
@@ -14,7 +14,7 @@
 
 import numpy as np
 from PIL import Image
-from cStringIO import StringIO
+from six.moves import cStringIO as StringIO
 
 
 def resize_image(img, target_size):
@@ -34,7 +34,7 @@ def flip(im):
     """
     Return the flipped image.
     Flip an image along the horizontal direction.
-    im: input image, (H x W x K) ndarrays 
+    im: input image, (H x W x K) ndarrays
     """
     if len(im.shape) == 3:
         return im[:, :, ::-1]
@@ -132,7 +132,7 @@ def load_meta(meta_path, mean_img_size, crop_size, color=True):
 
 def load_image(img_path, is_color=True):
     """
-    Load image and return. 
+    Load image and return.
     img_path: image path.
     is_color: is color image or not.
     """
@@ -205,7 +205,7 @@ class ImageTransformer:
 
     def set_mean(self, mean):
         if mean is not None:
-            # mean value, may be one value per channel 
+            # mean value, may be one value per channel
             if mean.ndim == 1:
                 mean = mean[:, np.newaxis, np.newaxis]
             else:
diff --git a/python/paddle/utils/make_model_diagram.py b/python/paddle/utils/make_model_diagram.py
index 40f99075de7fb2401b3b704afe1eb44dbe6072dd..52759d3ad230c3a5a5488a8bc46a2e8f8fae1025 100644
--- a/python/paddle/utils/make_model_diagram.py
+++ b/python/paddle/utils/make_model_diagram.py
@@ -15,6 +15,9 @@
 # Generate dot diagram file for the given paddle model config
 # The generated file can be viewed using Graphviz (http://graphviz.org)
 
+from __future__ import print_function
+
+import six
 import sys
 import traceback
 
@@ -61,9 +64,9 @@ def make_diagram_from_proto(model_config, dot_file):
                                              name2id[mem.link_name])
         return s
 
-    print >> f, 'digraph graphname {'
-    print >> f, 'node [width=0.375,height=0.25];'
-    for i in xrange(len(model_config.layers)):
+    print('digraph graphname {', file=f)
+    print('node [width=0.375,height=0.25];', file=f)
+    for i in six.moves.xrange(len(model_config.layers)):
         l = model_config.layers[i]
         name2id[l.name] = i
 
@@ -71,12 +74,12 @@ def make_diagram_from_proto(model_config, dot_file):
     for sub_model in model_config.sub_models:
         if sub_model.name == 'root':
             continue
-        print >> f, 'subgraph cluster_%s {' % i
-        print >> f, 'style=dashed;'
+        print('subgraph cluster_%s {' % i, file=f)
+        print('style=dashed;', file=f)
         label = '%s ' % sub_model.name
         if sub_model.reversed:
             label += '<=='
-        print >> f, 'label = "%s";' % label
+        print('label = "%s";' % label, file=f)
         i += 1
         submodel_layers.add(sub_model.name)
         for layer_name in sub_model.layer_names:
@@ -84,37 +87,41 @@ def make_diagram_from_proto(model_config, dot_file):
             lid = name2id[layer_name]
             layer_config = model_config.layers[lid]
             label = make_layer_label(layer_config)
-            print >> f, 'l%s [label="%s", shape=box];' % (lid, label)
-        print >> f, '}'
+            print('l%s [label="%s", shape=box];' % (lid, label), file=f)
+        print('}', file=f)
 
-    for i in xrange(len(model_config.layers)):
+    for i in six.moves.xrange(len(model_config.layers)):
         l = model_config.layers[i]
         if l.name not in submodel_layers:
             label = make_layer_label(l)
-            print >> f, 'l%s [label="%s", shape=box];' % (i, label)
+            print('l%s [label="%s", shape=box];' % (i, label), file=f)
 
     for sub_model in model_config.sub_models:
         if sub_model.name == 'root':
             continue
         for link in sub_model.in_links:
-            print >> f, make_link(link)
+            print(make_link(link), file=f)
         for link in sub_model.out_links:
-            print >> f, make_link(link)
+            print(make_link(link), file=f)
         for mem in sub_model.memories:
-            print >> f, make_mem(mem)
+            print(make_mem(mem), file=f)
 
-    for i in xrange(len(model_config.layers)):
+    for i in six.moves.xrange(len(model_config.layers)):
         for l in model_config.layers[i].inputs:
-            print >> f, 'l%s -> l%s [label="%s"];' % (
-                name2id[l.input_layer_name], i, l.input_parameter_name)
+            print(
+                'l%s -> l%s [label="%s"];' % (name2id[l.input_layer_name], i,
+                                              l.input_parameter_name),
+                file=f)
 
-    print >> f, '}'
+    print('}', file=f)
     f.close()
 
 
 def usage():
-    print >> sys.stderr, ("Usage: python show_model_diagram.py" +
-                          " CONFIG_FILE DOT_FILE [config_str]")
+    print(
+        ("Usage: python show_model_diagram.py" +
+         " CONFIG_FILE DOT_FILE [config_str]"),
+        file=sys.stderr)
     exit(1)
 
 
diff --git a/python/paddle/utils/merge_model.py b/python/paddle/utils/merge_model.py
index 2b100207728a8532e900992f7db4d3910e893dea..b74649e93640c3600636034d58792b8d12dffeda 100644
--- a/python/paddle/utils/merge_model.py
+++ b/python/paddle/utils/merge_model.py
@@ -70,4 +70,4 @@ def merge_v2_model(net, param_file, output_file):
         for pname in param_names:
             params.serialize(pname, f)
 
-    print 'Generate  %s  success!' % (output_file)
+    print('Generate  %s  success!' % (output_file))
diff --git a/python/paddle/utils/plotcurve.py b/python/paddle/utils/plotcurve.py
index 27bd8157d39632913e2fa3278f3af20ddea61da7..a95e5497e23571e61e5d7652830a99efd7793083 100644
--- a/python/paddle/utils/plotcurve.py
+++ b/python/paddle/utils/plotcurve.py
@@ -44,6 +44,7 @@ To use this script to generate plot for AvgCost, error:
    python plotcurve.py -i paddle.INFO -o figure.png AvgCost error
 """
 
+import six
 import sys
 import matplotlib
 # the following line is added immediately after import matplotlib
@@ -91,7 +92,7 @@ def plot_paddle_curve(keys, inputfile, outputfile, format='png',
         sys.stderr.write("No data to plot. Exiting!\n")
         return
     m = len(keys) + 1
-    for i in xrange(1, m):
+    for i in six.moves.xrange(1, m):
         pyplot.plot(
             x[:, 0],
             x[:, i],
diff --git a/python/paddle/utils/predefined_net.py b/python/paddle/utils/predefined_net.py
index fa05f981f2b66bf55303a6f7c332c0bc9b112d29..2801f4877c079615239b92be146b3e33df16b37f 100644
--- a/python/paddle/utils/predefined_net.py
+++ b/python/paddle/utils/predefined_net.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import numpy as np
+import six
 import os
 from paddle.trainer.config_parser import *
 from paddle.utils.preprocess_img import \
@@ -112,7 +113,7 @@ def simple_conv_net(data_conf, is_color=False):
         num_classes: num of classes.
         is_color: whether the input images are color.
     """
-    for k, v in data_conf.iteritems():
+    for k, v in six.iteritems(data_conf):
         globals()[k] = v
     data_input, label_input, num_image_channels = \
         image_data_layers(image_size, num_classes, is_color, is_predict)
@@ -340,7 +341,7 @@ def small_vgg(data_conf, is_predict=False):
         num_classes: num of classes.
         is_color: whether the input images are color.
     """
-    for k, v in data_conf.iteritems():
+    for k, v in six.iteritems(data_conf):
         globals()[k] = v
     vgg_conv_net(image_size, num_classes,
                  num_layers=[2, 2, 3, 3],
diff --git a/python/paddle/utils/preprocess_img.py b/python/paddle/utils/preprocess_img.py
index 975f1e9edea161331d37afbc6b5af46286f185bf..a322f7b769a2a32df516a4b8ea04289a7f882ff2 100644
--- a/python/paddle/utils/preprocess_img.py
+++ b/python/paddle/utils/preprocess_img.py
@@ -17,9 +17,9 @@ import os
 import random
 import numpy as np
 import PIL.Image as Image
-import StringIO
-import preprocess_util
-from image_util import crop_img
+from six.moves import cStringIO as StringIO
+from . import preprocess_util
+from .image_util import crop_img
 
 
 def resize_image(img, target_size):
@@ -52,7 +52,7 @@ class DiskImage:
 
     def read_image(self):
         if self.img is None:
-            print "reading: " + self.path
+            print("reading: " + self.path)
             image = resize_image(Image.open(self.path), self.target_size)
             self.img = image
 
@@ -69,7 +69,7 @@ class DiskImage:
         convert the image into the paddle batch format.
         """
         self.read_image()
-        output = StringIO.StringIO()
+        output = StringIO()
         self.img.save(output, "jpeg")
         contents = output.getvalue()
         return contents
@@ -127,7 +127,7 @@ class ImageClassificationDatasetCreater(preprocess_util.DatasetCreater):
             image_path = items[0]
             label_name = items[1]
             if not label_name in label_set:
-                label_set[label_name] = len(label_set.keys())
+                label_set[label_name] = len(list(label_set.keys()))
             img = DiskImage(path=image_path, target_size=self.target_size)
             label = preprocess_util.Lablel(
                 label=label_set[label_name], name=label_name)
@@ -144,7 +144,7 @@ class ImageClassificationDatasetCreater(preprocess_util.DatasetCreater):
             return create_dataset_from_list(path)
         label_set = preprocess_util.get_label_set_from_dir(path)
         data = []
-        for l_name in label_set.keys():
+        for l_name in list(label_set.keys()):
             image_paths = preprocess_util.list_images(
                 os.path.join(path, l_name))
             for p in image_paths:
diff --git a/python/paddle/utils/preprocess_util.py b/python/paddle/utils/preprocess_util.py
index 1d17a488243eb81e46bea3ead686efd021499e22..05b2067d01a2c544d7f5bd68320e79c805282286 100644
--- a/python/paddle/utils/preprocess_util.py
+++ b/python/paddle/utils/preprocess_util.py
@@ -14,7 +14,7 @@
 
 import os
 import math
-import cPickle as pickle
+import six.moves.cPickle as pickle
 import random
 import collections
 
@@ -169,7 +169,7 @@ class Dataset:
             random.shuffle(keyvalue_indices[k])
 
         num_data_per_key_batch = \
-            math.ceil(num_per_batch / float(len(keyvalue_indices.keys())))
+            math.ceil(num_per_batch / float(len(list(keyvalue_indices.keys()))))
 
         if num_data_per_key_batch < 2:
             raise Exception("The number of data in a batch is too small")
@@ -182,8 +182,8 @@ class Dataset:
                 end_idx = int(
                     min(begin_idx + num_data_per_key_batch,
                         len(keyvalue_indices[k])))
-                print "begin_idx, end_idx"
-                print begin_idx, end_idx
+                print("begin_idx, end_idx")
+                print(begin_idx, end_idx)
                 for idx in range(begin_idx, end_idx):
                     permuted_data.append(self.data[keyvalue_indices[k][idx]])
                 keyvalue_readpointer[k] = end_idx
@@ -357,6 +357,6 @@ class DatasetCreater(object):
             data_batcher.create_batches_and_list(
                 self.output_path, self.train_list_name, self.test_list_name,
                 self.label_set_name)
-            self.num_classes = len(train_label_set.keys())
+            self.num_classes = len(list(train_label_set.keys()))
             self.create_meta_file(train_data)
         return out_path
diff --git a/python/paddle/utils/show_pb.py b/python/paddle/utils/show_pb.py
index 20614826d1d01f50a2bb54a840e2c584fb93b247..da7a71a665aea4d93d366e8508f438a9aba88e94 100644
--- a/python/paddle/utils/show_pb.py
+++ b/python/paddle/utils/show_pb.py
@@ -15,6 +15,8 @@
 Show the content of proto buffer data file of PADDLE
 """
 
+from __future__ import print_function
+
 import os
 import sys
 from google.protobuf.internal.decoder import _DecodeVarint
@@ -39,7 +41,7 @@ def read_proto(file, message):
 
 
 def usage():
-    print >> sys.stderr, "Usage: python show_pb.py PROTO_DATA_FILE"
+    print("Usage: python show_pb.py PROTO_DATA_FILE", file=sys.stderr)
     exit(1)
 
 
@@ -50,8 +52,8 @@ if __name__ == '__main__':
     f = open(sys.argv[1])
     header = DataFormat.DataHeader()
     read_proto(f, header)
-    print header
+    print(header)
 
     sample = DataFormat.DataSample()
     while read_proto(f, sample):
-        print sample
+        print(sample)
diff --git a/python/paddle/utils/torch2paddle.py b/python/paddle/utils/torch2paddle.py
index 91490111a1144ae25ed6566ff1c83db4f7954d33..398d3aa4e02cc74b7885f7e676937d7fd254bc5e 100644
--- a/python/paddle/utils/torch2paddle.py
+++ b/python/paddle/utils/torch2paddle.py
@@ -24,7 +24,7 @@ import sys
 import struct
 import numpy as np
 import torchfile
-import cPickle as pickle
+import six.moves.cPickle as pickle
 import argparse
 
 
@@ -48,7 +48,7 @@ def save_net_parameters(layers, params, output_path):
         biases = params[i * 2 + 1]
         weight_file = os.path.join(output_path, '_%s.w0' % layers[i])
         biases_file = os.path.join(output_path, '_%s.wbias' % layers[i])
-        print "Saving for layer %s." % layers[i]
+        print("Saving for layer %s." % layers[i])
         save_layer_parameters(weight_file, [weight])
         save_layer_parameters(biases_file, biases)
 
diff --git a/python/setup.py.in b/python/setup.py.in
index 4a6cddbbea4903f5a65123aa19b7e978b335f32b..786c9f2e39880b68700b8acb94b3d35a48323958 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -159,18 +159,20 @@ if '${WITH_MKL}' == 'ON':
     shutil.copy('${MKLML_LIB}', libs_path)
     shutil.copy('${MKLML_IOMP_LIB}', libs_path)
     package_data['paddle.libs']+=['libmklml_intel.so','libiomp5.so']
-if '${WITH_MKLDNN}' == 'ON':
-    # TODO(typhoonzero): use install_name_tool to patch mkl libs once
-    # we can support mkl on mac.
-    #
-    # change rpath of libmkldnn.so.0, add $ORIGIN/ to it.
-    # The reason is that all thirdparty libraries in the same directory,
-    # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so.
-    command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
-    if os.system(command) != 0:
-        raise Exception("patch libmkldnn.so failed, command: %s" % command)
-    package_data['paddle.libs']+=['libmkldnn.so.0']
-    shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
+if '${CMAKE_BUILD_TYPE}' == 'Release':
+    # only change rpath in Release mode.
+    if '${WITH_MKLDNN}' == 'ON':
+        # TODO(typhoonzero): use install_name_tool to patch mkl libs once
+        # we can support mkl on mac.
+        #
+        # change rpath of libmkldnn.so.0, add $ORIGIN/ to it.
+        # The reason is that all thirdparty libraries in the same directory,
+        # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so.
+        command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
+        if os.system(command) != 0:
+            raise Exception("patch libmkldnn.so failed, command: %s" % command)
+        package_data['paddle.libs']+=['libmkldnn.so.0']
+        shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
 # remove unused paddle/libs/__init__.py
 os.remove(libs_path+'/__init__.py')
 package_dir['paddle.libs']=libs_path
@@ -179,20 +181,22 @@ package_dir['paddle.libs']=libs_path
 # The reason is that libwarpctc.so, libiomp5.so etc are in paddle.libs, and
 # core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries.
 # This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213
-if "@APPLE@" == "1":
-    command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
-else:
-    command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
-if os.system(command) != 0:
-    raise Exception("patch core.so failed, command: %s" % command)
-if '${WITH_FLUID_ONLY}'== 'OFF':
-    # change rpath of _swig_paddle.so.
+if '${CMAKE_BUILD_TYPE}' == 'Release':
+    # only change rpath in Release mode, since in Debug mode, core.so is too large to be changed.
     if "@APPLE@" == "1":
-        command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
+        command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
     else:
-        command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
+        command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
     if os.system(command) != 0:
-        raise Exception("patch _swig_paddle.so failed, command: %s" % command)
+        raise Exception("patch core.so failed, command: %s" % command)
+    if '${WITH_FLUID_ONLY}'== 'OFF':
+        # change rpath of _swig_paddle.so.
+        if "@APPLE@" == "1":
+            command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
+        else:
+            command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
+        if os.system(command) != 0:
+            raise Exception("patch _swig_paddle.so failed, command: %s" % command)
 
 setup(name='${PACKAGE_NAME}',
       version='${PADDLE_VERSION}',
diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh
index 93591fa9ddad8a78df344e1e912a5f1c7e93dfa4..eb4b477dcb538f7ba17cfc54057a97c9669a6916 100644
--- a/tools/manylinux1/build_scripts/build.sh
+++ b/tools/manylinux1/build_scripts/build.sh
@@ -28,7 +28,7 @@ AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969
 PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel"
 
 # Libraries that are allowed as part of the manylinux1 profile
-MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel  mesa-libGL-devel libICE-devel libSM-devel ncurses-devel"
+MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel  mesa-libGL-devel libICE-devel libSM-devel ncurses-devel freetype-devel libpng-devel"
 
 # Get build utilities
 MY_DIR=$(dirname "${BASH_SOURCE[0]}")
@@ -105,7 +105,7 @@ curl-config --features
 rm -rf /usr/local/ssl
 
 # Install patchelf (latest with unreleased bug fixes)
-curl -sLO https://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz
+curl -sLO http://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz
 check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH
 tar -xzf patchelf-0.9njs2.tar.gz
 (cd patchelf-0.9njs2 && ./configure && make && make install)
diff --git a/tools/manylinux1/build_scripts/install_nccl2.sh b/tools/manylinux1/build_scripts/install_nccl2.sh
index 282c5c290da14bd3c04346ab01fdb48423c23f88..43a99d8287bbaa13ff75d9f25972a6335ae0754a 100644
--- a/tools/manylinux1/build_scripts/install_nccl2.sh
+++ b/tools/manylinux1/build_scripts/install_nccl2.sh
@@ -21,5 +21,5 @@ for sub_deb in $DEBS; do
   ar x $sub_deb && tar xf data.tar.xz
 done
 mv -f usr/include/nccl.h /usr/local/include/
-mv -f usr/lib/libnccl* /usr/local/lib/
+mv -f usr/lib/x86_64-linux-gnu/libnccl* /usr/local/lib/
 rm -rf $DIR
diff --git a/tools/test_runner.py b/tools/test_runner.py
index 2d6a3cf8a97a3bbaa69b66f5343c54b750624329..9b9f165e7368364bbb0a78d6dcbbe4be0d6bf98b 100644
--- a/tools/test_runner.py
+++ b/tools/test_runner.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from __future__ import print_function
+
 import unittest
 import os
 import sys