diff --git a/CMakeLists.txt b/CMakeLists.txt
index 32ec1608f352b1972b293598f9296b772daa4b84..420dafee65f8c3464ea93d38c338a8219cb02e08 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,7 @@ include(simd)
 
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
+option(WITH_AMD_GPU     "Compile PaddlePaddle with AMD GPU"             OFF)
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
@@ -181,6 +182,11 @@ if(WITH_GPU)
     include(cuda)
 endif(WITH_GPU)
 
+if(WITH_AMD_GPU)
+    find_package(HIP)
+    include(hip)
+endif(WITH_AMD_GPU)
+
 if(WITH_MKLML)
     list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
 endif()
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
index 786f224608f7d41c438411de0e09fedbcf2264b8..8b29227cfab2a36d5b9f6d17b837b33da8d2a92e 100644
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -18,12 +18,13 @@ import sys
 import time
 import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-import paddle.v2.fluid.core as core
-import paddle.v2.fluid.profiler as profiler
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
 import argparse
 import functools
 import os
+from paddle.fluid import debuger
 
 
 def str2bool(v):
@@ -182,28 +183,27 @@ def main():
             start_time = time.time()
             num_samples = 0
             train_pass_acc.reset()
-            with profiler.profiler("CPU", 'total') as prof:
-                for batch_id, data in enumerate(train_reader()):
-                    ts = time.time()
-                    img_data = np.array(
-                        map(lambda x: x[0].reshape(data_shape), data)).astype(
-                            "float32")
-                    y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-                    y_data = y_data.reshape([-1, 1])
-
-                    loss, acc, b_size = exe.run(
-                        trainer_prog,
-                        feed={"pixel": img_data,
-                              "label": y_data},
-                        fetch_list=[avg_cost, batch_acc, batch_size])
-                    iters += 1
-                    num_samples += len(data)
-                    train_pass_acc.add(value=acc, weight=b_size)
-                    print(
-                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
-                        % (pass_id, iters, loss, acc,
-                           len(data) / (time.time() - ts))
-                    )  # The accuracy is the accumulation of batches, but not the current batch.
+            for batch_id, data in enumerate(train_reader()):
+                ts = time.time()
+                img_data = np.array(
+                    map(lambda x: x[0].reshape(data_shape), data)).astype(
+                        "float32")
+                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                y_data = y_data.reshape([-1, 1])
+
+                loss, acc, b_size = exe.run(
+                    trainer_prog,
+                    feed={"pixel": img_data,
+                          "label": y_data},
+                    fetch_list=[avg_cost, batch_acc, batch_size])
+                iters += 1
+                num_samples += len(data)
+                train_pass_acc.add(value=acc, weight=b_size)
+                print(
+                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
+                    % (pass_id, iters, loss, acc,
+                       len(data) / (time.time() - ts))
+                )  # The accuracy is the accumulation of batches, but not the current batch.
 
             pass_elapsed = time.time() - start_time
             pass_train_acc = train_pass_acc.eval()
@@ -254,9 +254,7 @@ def main():
             pserver_prog = t.get_pserver_program(current_endpoint)
             pserver_startup = t.get_startup_program(current_endpoint,
                                                     pserver_prog)
-            print("starting server side startup")
             exe.run(pserver_startup)
-            print("starting parameter server...")
             exe.run(pserver_prog)
         elif training_role == "TRAINER":
             # Parameter initialization
diff --git a/benchmark/cluster/vgg16/vgg16_tf.py b/benchmark/cluster/vgg16/vgg16_tf.py
index 996df0e314b867ea8de618dfd3977f490fbe8372..2d220478acae46566760209dbc012cff316946aa 100644
--- a/benchmark/cluster/vgg16/vgg16_tf.py
+++ b/benchmark/cluster/vgg16/vgg16_tf.py
@@ -292,14 +292,18 @@ def run_benchmark(cluster_spec, server):
         return np.mean(test_accs)
 
     config = tf.ConfigProto(
-        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+        intra_op_parallelism_threads=1,
+        inter_op_parallelism_threads=1,
+        log_device_placement=True)
     config.gpu_options.allow_growth = True
 
     hooks = [tf.train.StopAtStepHook(last_step=1000000)]
 
     with tf.train.MonitoredTrainingSession(
-            master=server.target, is_chief=(args.task_index == 0),
-            hooks=hooks) as sess:
+            master=server.target,
+            is_chief=(args.task_index == 0),
+            hooks=hooks,
+            config=config) as sess:
         iters, num_samples, start_time = 0, 0, 0.0
         for pass_id in range(args.num_passes):
             # train
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 0f76f55270592c5625a9624b33f4c0f82efdc627..f726405c4773994f6ca6509e5218750805b03995 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -57,11 +57,7 @@ if(NOT WITH_GOLANG)
     add_definitions(-DPADDLE_WITHOUT_GOLANG)
 endif(NOT WITH_GOLANG)
 
-if(NOT WITH_GPU)
-    add_definitions(-DHPPL_STUB_FUNC)
-
-    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
-else()
+if(WITH_GPU)
     add_definitions(-DPADDLE_WITH_CUDA)
 
     FIND_PACKAGE(CUDA REQUIRED)
@@ -84,7 +80,14 @@ else()
     # Include cuda and cudnn
     include_directories(${CUDNN_INCLUDE_DIR})
     include_directories(${CUDA_TOOLKIT_INCLUDE})
-endif(NOT WITH_GPU)
+elseif(WITH_AMD_GPU)
+    add_definitions(-DPADDLE_WITH_HIP)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__")
+else()
+    add_definitions(-DHPPL_STUB_FUNC)
+    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
+endif()
 
 if (WITH_MKLML AND MKLML_IOMP_LIB)
     message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index d9cd264b49d546c35a2c57a82ead83ea654b60ae..10662fc96704685f030a5d76c6857d4bc20a63d9 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -24,7 +24,7 @@ set(BOOST_PROJECT       "extern_boost")
 # So we use 1.41.0 here.
 set(BOOST_VER           "1.41.0")
 set(BOOST_TAR           "boost_1_41_0")
-set(BOOST_URL           "http://paddlepaddledeps.s3-website-us-west-1.amazonaws.com/${BOOST_TAR}.tar.gz")
+set(BOOST_URL           "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz")
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 6a701e076c95372f903a09d35d4208ee73bd584c..73d70c34dce8bedd9e62519c207e5be3dcf7dba3 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -4,18 +4,33 @@ SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
 SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
 INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
 
-ExternalProject_Add(
-    extern_eigen3
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
-    GIT_TAG         70661066beef694cadf6c304d0d07e0758825c10
-    PREFIX          ${EIGEN_SOURCE_DIR}
-    UPDATE_COMMAND  ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND     ""
-    INSTALL_COMMAND   ""
-    TEST_COMMAND      ""
-)
+if(WITH_AMD_GPU)
+    ExternalProject_Add(
+        extern_eigen3
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/sabreshao/hipeigen.git"
+        GIT_TAG         0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
+        PREFIX          ${EIGEN_SOURCE_DIR}
+        UPDATE_COMMAND  ""
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND     ""
+        INSTALL_COMMAND   ""
+        TEST_COMMAND      ""
+    )
+else()
+    ExternalProject_Add(
+        extern_eigen3
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
+        GIT_TAG         70661066beef694cadf6c304d0d07e0758825c10
+        PREFIX          ${EIGEN_SOURCE_DIR}
+        UPDATE_COMMAND  ""
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND     ""
+        INSTALL_COMMAND   ""
+        TEST_COMMAND      ""
+    )
+endif()
 
 if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
     set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 471e3929069d0d28105404b4f0f6baa303faf0e0..c749c97f13649fe8432091414b56f7d0ea8ace8b 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -317,6 +317,82 @@ function(nv_test TARGET_NAME)
   endif()
 endfunction(nv_test)
 
+function(hip_library TARGET_NAME)
+  if (WITH_AMD_GPU)
+    set(options STATIC static SHARED shared)
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(_sources ${hip_library_SRCS})
+    HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
+    if(_source_files)
+      list(REMOVE_ITEM _sources ${_source_files})
+    endif()
+    if(hip_library_SRCS)
+      if (hip_library_SHARED OR hip_library_shared) # build *.so
+        add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources})
+        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
+      else()
+        add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
+        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+        target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
+	find_fluid_modules(${TARGET_NAME})
+      endif()
+      if (hip_library_DEPS)
+	add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
+	target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+      endif()
+      # cpplint code style
+      foreach(source_file ${hip_library_SRCS})
+	string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+	if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+	  list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+	endif()
+      endforeach()
+      add_style_check_target(${TARGET_NAME} ${hip_library_SRCS} ${hip_library_HEADERS})
+    else(hip_library_SRCS)
+      if (hip_library_DEPS)
+	merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
+      else()
+	message(FATAL "Please specify source file or library in nv_library.")
+      endif()
+    endif(hip_library_SRCS)
+  endif()
+endfunction(hip_library)
+
+function(hip_binary TARGET_NAME)
+  if (WITH_AMD_GPU)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS})
+    if(hip_binary_DEPS)
+      target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
+      add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
+    endif()
+  endif()
+endfunction(hip_binary)
+
+function(hip_test TARGET_NAME)
+  if (WITH_AMD_GPU AND WITH_TESTING)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(_sources ${hip_test_SRCS})
+    HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
+    if(_source_files)
+      list(REMOVE_ITEM _sources ${_source_files})
+    endif()
+    add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
+    set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
+    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_test(${TARGET_NAME} ${TARGET_NAME})
+  endif()
+endfunction(hip_test)
+
 function(go_library TARGET_NAME)
   set(options STATIC static SHARED shared)
   set(oneValueArgs "")
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..bfe491bd6b7602959d3dd60bd06c67993593cc9b
--- /dev/null
+++ b/cmake/hip.cmake
@@ -0,0 +1,43 @@
+if(NOT WITH_AMD_GPU)
+    return()
+endif()
+
+include_directories("/opt/rocm/include")
+include_directories("/opt/rocm/hipblas/include")
+include_directories("/opt/rocm/hiprand/include")
+include_directories("/opt/rocm/rocrand/include")
+include_directories("/opt/rocm/rccl/include")
+include_directories("/opt/rocm/thrust")
+
+list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
+
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" )
+
+if(WITH_DSO)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
+endif(WITH_DSO)
+
+if(WITH_DOUBLE)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE")
+endif(WITH_DOUBLE)
+
+if(WITH_TESTING)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
+endif(WITH_TESTING)
+
+if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
+    list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
+    list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
+    list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
+endif()
+
+if("x${HCC_HOME}" STREQUAL "x")
+  set(HCC_HOME "/opt/rocm/hcc")
+endif()
+
+set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
+set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
+set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
+
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index da67701ec1af57df742dce105990cffa40f45d7c..a9b27933a5307aabeaf150aeb859e869197229f5 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(v2)
+add_subdirectory(fluid)
diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cc999f5a8d70a2239ea3b130e9da172d5f681c65
--- /dev/null
+++ b/doc/fluid/CMakeLists.txt
@@ -0,0 +1,49 @@
+if(NOT DEFINED SPHINX_THEME)
+    set(SPHINX_THEME default)
+endif()
+
+if(NOT DEFINED SPHINX_THEME_DIR)
+    set(SPHINX_THEME_DIR)
+endif()
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output director
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_fluid_docs
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
+    "${BINARY_BUILD_DIR_CN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_fluid_docs_cn
+                  html
+                  ${BINARY_BUILD_DIR_CN}
+                  ${SPHINX_CACHE_DIR_CN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_CN})
diff --git a/doc/fluid/build_and_install/index_cn.rst b/doc/fluid/build_and_install/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9276236f9fd511bde3570a8c88b437119911d60a
--- /dev/null
+++ b/doc/fluid/build_and_install/index_cn.rst
@@ -0,0 +1,2 @@
+安装与使用
+------------
diff --git a/doc/fluid/build_and_install/index_en.rst b/doc/fluid/build_and_install/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..cc1e61a58a026a0f5c3b106875a8a86dc9cba613
--- /dev/null
+++ b/doc/fluid/build_and_install/index_en.rst
@@ -0,0 +1,2 @@
+Build and Install
+------------
diff --git a/doc/design/images/multiple_reader.png b/doc/fluid/design/concepts/images/multiple_reader.png
similarity index 100%
rename from doc/design/images/multiple_reader.png
rename to doc/fluid/design/concepts/images/multiple_reader.png
diff --git a/doc/design/images/readers.png b/doc/fluid/design/concepts/images/readers.png
similarity index 100%
rename from doc/design/images/readers.png
rename to doc/fluid/design/concepts/images/readers.png
diff --git a/doc/fluid/design/concurrent/images/select_op_workflow.png b/doc/fluid/design/concurrent/images/select_op_workflow.png
new file mode 100644
index 0000000000000000000000000000000000000000..719ed76f9d542d6c4f20c30f27656bb53325aa85
Binary files /dev/null and b/doc/fluid/design/concurrent/images/select_op_workflow.png differ
diff --git a/doc/fluid/design/concurrent/select_op.md b/doc/fluid/design/concurrent/select_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..52c226bc94a4e8bfc5588705d7f65328840e91cc
--- /dev/null
+++ b/doc/fluid/design/concurrent/select_op.md
@@ -0,0 +1,265 @@
+# select_op Design
+
+## Introduction
+
+In golang, the [**select**](https://golang.org/ref/spec#Select_statements) 
+statement lets a goroutine wait on multiple communication operations at the 
+same time. The **select** blocks until one of its cases can run, then 
+executes the case.  If multiple cases are ready to run, then one case is 
+choosen at random to be executed.
+
+With the introduction of CSP for Paddle, we mimic this behavior by 
+creating a ***select_op***.
+
+## How to use it
+
+The **select_op** is available as a c++ operator.  However most users
+will prefer to use the much simplier Python API.
+
+- **fluid.Select()**: Creates a select operator and adds it to the current
+block within the main program.  Also creates a sub block and adds it to the 
+main program.  This sub block is used to hold all variables and operators 
+used by the case statements.
+ 
+Within the select block, users can add cases by 
+calling **select.case** or **select.default** method.
+
+- **fluid.Select.case(channel_action, channel, result_variable)**: Represents
+a fluid channel send/recv case.  This method creates a SelectCase block
+guard and adds it to the Select block.  The arguments into this method tells
+the select which channel operation to listen to.
+
+- **fluid.Select.default()**: Represents the fluid default case.  This default
+case is executed if none of the channel send/recv cases are available to
+execute.
+
+**Example:**
+```
+ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+            
+x = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
+y = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
+ 
+while_cond = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
+while_op = While(cond=while_cond)    
+ 
+with while_op.block():
+    with fluid.Select() as select:
+        with select.case(fluid.channel_send, channel, x):
+            # Send x, then perform Fibonacci calculation on x and y
+            x_tmp = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
+            assign(input=x, output=x_tmp)
+            assign(input=y, output=x)
+            assign(elementwise_add(x=x_tmp, y=y), output=y)
+        with select.case(fluid.channel_recv, quit_channel, result2):
+            # Exit out of While loop
+            while_false = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False)
+            helper = layer_helper.LayerHelper('assign')
+            helper.append_op(
+                type='assign',
+                inputs={'X': [while_false]},
+                outputs={'Out': [while_cond]})
+```
+
+## How it Works
+
+### Program Description
+
+```
+blocks {
+  idx: 0
+  ...
+  // Create "case_to_execute" variable
+  ops {
+    outputs {
+      parameter: "Out"
+      arguments: "fill_constant_110.tmp_0"
+    }
+    type: "fill_constant"
+    attrs {
+      name: "force_cpu"
+      type: BOOLEAN
+      b: false
+    }
+    attrs {
+      name: "value"
+      type: FLOAT
+      f: -1.0
+    }
+    attrs {
+      name: "shape"
+      type: INTS
+      ints: 1
+    }
+    attrs {
+      name: "dtype"
+      type: INT
+      i: 2
+    }
+  }
+  // Create "select" operator.
+  // inputs: 
+  //   X: All input variables used by operators within the select block
+  //   case_to_execute: Variable filled in by select_op when it determines
+  //     which case to execute.
+  //  
+  // outputs:
+  //   Out: All output variables referenced by operators within select block. 
+  // 
+  // attrs:
+  //   sub_block: The block id containing the select "cases"
+  //   cases:  Serialized list of all cases in the select op. 
+  //     Each case is serialized as: '<index>,<type>,<channel>,<value>'
+  //     where type is 0 for default, 1 for send, and 2 for receive.
+  //     No channel and values are needed for default cases.
+  ops {
+    inputs {
+      parameter: "X"
+      arguments: "fill_constant_103.tmp_0"
+      arguments: "fill_constant_104.tmp_0"
+    }
+    inputs {
+      parameter: "case_to_execute"
+      arguments: "fill_constant_110.tmp_0"
+    }
+    outputs {
+      parameter: "Out"
+      arguments: "fill_constant_110.tmp_0"
+    }    
+    type: "select"
+    attrs {
+      name: "sub_block"
+      type: BLOCK
+      block_idx: 1
+    }
+    attrs {
+      name: "cases"
+      type: STRINGS
+      strings: "0,1,channel_101,fill_constant_109.tmp_0"
+      strings: "1,2,channel_102,fill_constant_108.tmp_0"
+    }
+  }
+  ...
+}
+```
+
+The python select API will add the **select_op** to the current block.  In addition, it will
+iterate through all it's case statements and add any input variables required by case statements
+into **X**.  It will also create a temp variable called **case_to_execute**.  This variable is
+filled in by the select_op after it has completed processing the case statements.
+
+If there are no available cases to execute (ie: all cases are blocked on channel operations, and
+there is no default statement), then the select_op will block the current thread.  The thread will 
+unblock once there is a channel operation affecting one of the case statements, at which point, the
+**select_op** will set the **case_to_execute** variable to the index of the case to execute.
+
+Finally the select_op will call executor.run on the **sub_block**.
+
+```
+blocks {
+  idx: 1
+  parent_idx: 0
+  ...
+  // Fill a tensor with the case index (ie: 0,1,2,3,ect.)
+  ops {
+    outputs {
+      parameter: "Out"
+      arguments: "fill_constant_111.tmp_0"
+    }
+    type: "fill_constant"
+    attrs {
+      name: "force_cpu"
+      type: BOOLEAN
+      b: false
+    }
+    attrs {
+      name: "value"
+      type: FLOAT
+      f: 0.0
+    }
+    attrs {
+      name: "shape"
+      type: INTS
+      ints: 1
+    }
+    attrs {
+      name: "dtype"
+      type: INT
+      i: 2
+    }
+  }
+  // Create an "equal" operator to compare the case index with the "case_to_execute"
+  // tensor (which was filled in by the select op).
+  ops {
+    inputs {
+      parameter: "X"
+      arguments: "fill_constant_111.tmp_0"  // case 0
+    }
+    inputs {
+      parameter: "Y"
+      arguments: "fill_constant_110.tmp_0"  // case_to_execute
+    }
+    outputs {
+      parameter: "Out"
+      arguments: "equal_0.tmp_0"
+    }
+    type: "equal"
+    attrs {
+      name: "axis"
+      type: INT
+      i: -1
+    }
+  }
+  // Use the output of the "equal" operator as a condition for the "conditional_block".
+  // If the condition evaluates to true, then execute the "sub_block" (which represents
+  // the select case's body)
+  ops {
+    inputs {
+      parameter: "Params"
+    }
+    inputs {
+      parameter: "X"
+      arguments: "equal_0.tmp_0"
+    }
+    outputs {
+      parameter: "Out"
+    }
+    outputs {
+      parameter: "Scope"
+      arguments: "_generated_var_0"
+    }
+    type: "conditional_block"
+    attrs {
+      name: "is_scalar_condition"
+      type: BOOLEAN
+      b: true
+    }
+    attrs {
+      name: "sub_block"
+      type: BLOCK
+      block_idx: 4
+    }
+  }
+  ...
+  // Repeat the above operators for each case statements inside the select body
+}
+
+```
+
+Cases are represented by a **conditional_block operator**, whose's condition is set as the output of 
+equal(**case_to_execute**, **case_index**).  Since each case index is unique in this sub-block, 
+only one case will be executed.
+
+### select_op flow
+
+<p align="center">
+<img src="./images/select_op_workflow.png"/><br/>
+</p>
+
+The select algorithm is inspired by golang's select routine.  Please refer to 
+http://www.tapirgames.com/blog/golang-concurrent-select-implementation for more information.
+
+## Backward Pass
+
+TODO
diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..e543adf0f97cc6b47415b807d7a1ed1effec9b22
--- /dev/null
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
@@ -0,0 +1,128 @@
+## Design Doc: Distributed Lookup Table Operator
+
+A lookup table operator in PaddlePaddle where the table could be out
+of the memory of a computer.
+
+## Background
+
+A lookup table operator is well-used in deep learning for learning the
+representation, or the
+[*embedding*](http://www.cs.toronto.edu/~fritz/absps/ieee-lre.pdf), of
+symbols.
+
+### The Forward Algorithm
+
+The forward algorithm of the lookup table is a multiplication of the
+input vector x and the lookup table matrix W:
+
+$$y = x * W$$
+
+When x is a sparse vector of symbols, the above multiplication
+simplifies into looking up rows in W that correspond to symbols in x,
+denoted by W(x).  Please be aware that W could be huge and out of the
+memory, so we'd need a distributed storage service, which supports the
+lookup of rows.
+
+The following figure illustrates the multiplication of x with two
+non-zero elements, or say, two symbols, and a lookup table W:
+
+![lookup table](./src/lookup_table.png)
+
+### The Backward Algorithm
+
+The backward algorithm computes W'(x) using W(x).  W'(x) has the same
+scale of size as W(x) and is much smaller than W.
+
+To optimize W given W', we can do simple SGD update:
+
+$$W = f(W') = \lambda * W'$$
+
+or some more sophisticated algorithms that rely on both W' and W:
+
+$$W = f(W, W')$$
+
+The following figure illustrates the backward pass of the lookup
+operator: ![lookup table training](./src/lookup_table_training.png)
+
+## Distributed Storage Service
+
+The forward algorithm requires a distributed storage service for W.
+The backward algorithm prefers that the storage system can apply the
+optimization algorithm on W.  The following two sections describe two
+solutions -- the former doesn't require that the storage service can
+do optimization, the latter does.
+
+### Storage Service Doesn't Optimize
+
+In this design, we use highly-optimized distributed storage, e.g.,
+memcached, as the storage service, and we run the optimization
+algorithm on parameter servers of PaddlePaddle.  The following figure
+illustrates the training process.
+
+<!--
+Note: please update the following URL when update this digraph.
+<img src='https://g.gravizo.com/svg?
+digraph G {
+  rankdir="LR";
+  subgraph cluster1 {
+  P1 [label="pserver 1"];
+  P2 [label="pserver 2"];
+  T1 [label="trainer 1"];
+  T2 [label="trainer 2"];
+  T3 [label="trainer 3"];
+  }
+  KV [label="memcached"];
+  T1 -> P1;
+  T1 -> P2;
+  T2 -> P1;
+  T2 -> P2;
+  T3 -> P1;
+  T3 -> P2;
+  P1 -> KV [color=gray, weight=0.1];
+  KV -> P1 [color=gray, weight=0.1];
+  P2 -> KV [color=gray, weight=0.1];
+  KV -> P2 [color=gray, weight=0.1];
+  KV -> T1 [color=gray, weight=0.1];
+  KV -> T2 [color=gray, weight=0.1];
+  KV -> T3 [color=gray, weight=0.1];
+}
+)
+'/>
+-->
+
+<img src='https://g.gravizo.com/svg?%20digraph%20G%20{%20rankdir=%22LR%22;%20subgraph%20cluster1%20{%20P1%20[label=%22pserver%201%22];%20P2%20[label=%22pserver%202%22];%20T1%20[label=%22trainer%201%22];%20T2%20[label=%22trainer%202%22];%20T3%20[label=%22trainer%203%22];%20}%20KV%20[label=%22memcached%22];%20T1%20-%3E%20P1;%20T1%20-%3E%20P2;%20T2%20-%3E%20P1;%20T2%20-%3E%20P2;%20T3%20-%3E%20P1;%20T3%20-%3E%20P2;%20P1%20-%3E%20KV%20[color=gray,%20weight=0.1];%20KV%20-%3E%20P1%20[color=gray,%20weight=0.1];%20P2%20-%3E%20KV%20[color=gray,%20weight=0.1];%20KV%20-%3E%20P2%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T1%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T2%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T3%20[color=gray,%20weight=0.1];%20}'/>
+
+Each trainer runs the forward and backward passes using their local
+data:
+
+1. In the forward pass, when a trainer runs the forward algorithm of a
+   lookup operator, it retrieves W(x) from the storage service.
+1. The trainer computes W'(x) in the backward pass using W(x).
+
+During the global update process:
+
+1. Each trainer uploads its W'(x) to parameter servers.
+1. The parameter server runs the optimization algorithm, e.g., the
+   Adam optimization algorithm, which requires that
+   1. The parameter server retrieves W(x) from memcached, and
+   1. The parameter server pushes $\Delta W(x)=f(W(x), lambda \sum_j
+      W'(x))$ to memcached, where $f$ denotes the optimization
+      algorithm.
+
+### Storage Service Does Optimize
+
+This design is very similar to the above one, except that the
+optimization algorithm $f$ runs on the storage service.
+
+- Pro: parameter servers do not retrieve W(x) from the storage
+  service, thus saves half network communication.
+- Con: the storage service needs to be able to run the optimization
+  algorithm.
+
+## Conclusion
+
+Let us do the "storage service does not optimize" solution first, as a
+baseline at least, because it is easier to use a well-optimized
+distributed storage service like memcached.  We can do the "storage
+service does optimize" solution later or at the same time, which, if
+implemented carefully, should have better performance than the former.
diff --git a/doc/fluid/design/dist_train/src/lookup_table.png b/doc/fluid/design/dist_train/src/lookup_table.png
new file mode 100644
index 0000000000000000000000000000000000000000..72dfe3547f731d0d090338afb206b0549dff472e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/lookup_table.png differ
diff --git a/doc/fluid/design/dist_train/src/lookup_table_training.png b/doc/fluid/design/dist_train/src/lookup_table_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..cc7cc4aeb3b885850fe2f70f19fb84d5873bed1e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/lookup_table_training.png differ
diff --git a/doc/fluid/design/dynamic_rnn/rnn.md b/doc/fluid/design/dynamic_rnn/rnn.md
index 2f4854793fa1f0b02e4dc17b51a48a972be61c06..6f414e5549b149bc88fb252085ff56dbb06730f8 100644
--- a/doc/fluid/design/dynamic_rnn/rnn.md
+++ b/doc/fluid/design/dynamic_rnn/rnn.md
@@ -5,7 +5,7 @@ This document describes the RNN (Recurrent Neural Network) operator and how it i
 ## RNN Algorithm Implementation
 
 <p align="center">
-<img src="./images/rnn.jpg"/>
+<img src="./rnn.jpg"/>
 </p>
 
 The above diagram shows an RNN unrolled into a full network.
@@ -22,7 +22,7 @@ There are several important concepts here:
 There could be local variables defined in each step-net.  PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
 
 <p align="center">
-<img src="./images/rnn.png"/><br/>
+<img src="./rnn.png"/><br/>
 Figure 2 illustrates the RNN's data flow
 </p>
 
@@ -49,7 +49,7 @@ or copy the memory value of the previous step to the current ex-memory variable.
 
 ### Usage in Python
 
-For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md).
 
 We can define an RNN's step-net using a Block:
 
@@ -93,7 +93,7 @@ For example, we could have a 2-level RNN, where the top level corresponds to par
 The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text.
 
 <p align="center">
-<img src="./images/2_level_rnn.png"/>
+<img src="./2_level_rnn.png"/>
 </p>
 
 ```python
@@ -149,5 +149,5 @@ If the `output_all_steps` is set to False, it will only output the final time st
 
 
 <p align="center">
-<img src="images/rnn_2level_data.png"/>
+<img src="./rnn_2level_data.png"/>
 </p>
diff --git a/doc/fluid/design/index_cn.rst b/doc/fluid/design/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f1887be6901653d4263d711d78b626d2abfd45c9
--- /dev/null
+++ b/doc/fluid/design/index_cn.rst
@@ -0,0 +1,2 @@
+设计思想
+------------
diff --git a/doc/fluid/design/index_en.rst b/doc/fluid/design/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..18a4b4122f6e3f0096676f34ffea8a80aa9b6696
--- /dev/null
+++ b/doc/fluid/design/index_en.rst
@@ -0,0 +1,2 @@
+Design
+------------
diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md
index f78fa8c1914124f33b9730f918c8887ced4f8d9d..110b7d78bf12ac8328fb3a913e4386e75d63c995 100644
--- a/doc/fluid/design/motivation/fluid.md
+++ b/doc/fluid/design/motivation/fluid.md
@@ -103,7 +103,7 @@ In computability theory, a system of data-manipulation rules, such as a programm
 
 There are two ways to execute a Fluid program.  When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
 
-There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
+There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
 
 Fluid is moving towards the direction of a compiler, which is explain in [fluid_compiler.md](fluid_compiler.md).
 
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e1edf079fa0f85eb7f6709fd945fffae88625d01
--- /dev/null
+++ b/doc/fluid/dev/index_cn.rst
@@ -0,0 +1,2 @@
+开发标准
+------------
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..faf9dfcd315fddc4774c3717b41086fa6c6bf85a
--- /dev/null
+++ b/doc/fluid/dev/index_en.rst
@@ -0,0 +1,4 @@
+Development
+------------
+
+This is Development page
diff --git a/doc/fluid/faq/index_cn.rst b/doc/fluid/faq/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..395c1109891b5a00eab6f0b44d855658def7fdd6
--- /dev/null
+++ b/doc/fluid/faq/index_cn.rst
@@ -0,0 +1,2 @@
+FAQ
+------------
diff --git a/doc/fluid/faq/index_en.rst b/doc/fluid/faq/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..395c1109891b5a00eab6f0b44d855658def7fdd6
--- /dev/null
+++ b/doc/fluid/faq/index_en.rst
@@ -0,0 +1,2 @@
+FAQ
+------------
diff --git a/doc/fluid/getstarted/index_cn.rst b/doc/fluid/getstarted/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c4d8525f23ee18cb7f41ab2f0d148fc1dcc852b2
--- /dev/null
+++ b/doc/fluid/getstarted/index_cn.rst
@@ -0,0 +1,4 @@
+新手入门
+------------
+
+新手入门
diff --git a/doc/fluid/getstarted/index_en.rst b/doc/fluid/getstarted/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a4efd05e2fd94ac0e2cbbc8603e6b0261b7e787f
--- /dev/null
+++ b/doc/fluid/getstarted/index_en.rst
@@ -0,0 +1,4 @@
+GET STARTED
+------------
+
+This is get started page
diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..1b6f767869aaa800c122c8e7a06a1413e48e10e0
--- /dev/null
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
@@ -0,0 +1,145 @@
+# Fluid 分布式版本使用指南
+本篇文章将说明如何在PaddlePaddle Fluid版本下进行分布式训练的配置和执行，以及将单机训练脚本改造成支持集群训练的版本
+
+## 准备工作
+* 可用的集群
+
+    包含一个或多个计算节点的集群，每一个节点都能够执行PaddlePaddle的训练任务且拥有唯一的IP地址，集群内的所有计算节点可以通过网络相互通信。
+* 安装PaddlePaddle Fluid with Distribution版本
+
+    所有的计算节点上均需要按照分布式版本的PaddlePaddle, 在用于GPU等设备的机器上还需要额外安装好相应的驱动程序和CUDA的库。
+
+    **注意：**当前对外提供的PaddlePaddle版本并不支持分布式，需要通过源码重新编译。编译和安装方法参见[编译和安装指南](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html)。
+    cmake编译命令中需要将WITH_DISTRIBUTE设置为ON，下面是一个cmake编译指令示例：
+``` bash
+cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
+```
+
+## 更新训练脚本
+这里，我们以[Deep Learing 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)课程中的第一章 fit a line 为例，描述如何将单机训练脚本改造成支持集群训练的版本。
+### 单机训练脚本示例
+```python
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+exe.run(fluid.default_startup_program())
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+    fluid.io.save_persistables(exe, "./fit_a_line.model/")
+    fluid.io.load_persistables(exe, "./fit_a_line.model/")
+    for data in train_reader():
+        avg_loss_value, = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+
+        if avg_loss_value[0] < 10.0:
+            exit(0)  # if avg cost less than 10.0, we think our code is good.
+exit(1)
+```
+
+我们创建了一个简单的全连接神经网络程序，并且通过Fluid的Executor执行了100次迭代,现在我们需要将该单机版本的程序更新为分布式版本的程序。
+### 介绍Parameter Server
+在非分布式版本的训练脚本中，只存在Trainer一种角色，它不仅处理常规的计算任务，也处理参数相关的计算、保存和优化任务。在分布式版本的训练过程中，由于存在多个Trainer节点进行同样的数据计算任务，因此需要有一个中心化的节点来统一处理参数相关的保存和分配。在PaddlePaddle中，我们称这样的节点为[Parameter Server](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/dist_train/parameter_server.md)
+
+**因此，在分布式的Fluid环境中，我们有两个角色需要创建，分别是Parameter Server和Trainer。**
+
+### 分布式训练 
+Fliud专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数，将他们分隔为两部分，通过send/recv 操作算子进行连接,优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。
+```python
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) 
+```
+将Distributed Transpiler、优化算子和梯度函数放在一个代码中如下：
+```python
+... #define the program, cost, and create sgd optimizer
+
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters
+
+t = fluid.DistributeTranspiler() # create the transpiler instance
+# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+... #create executor
+
+# in pserver, run this
+#current_endpoint here means current pserver IP:PORT you wish to run on
+pserver_prog = t.get_pserver_program(current_endpoint)
+pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+exe.run(pserver_startup)
+exe.run(pserver_prog)
+
+# in trainer, run this
+... # define data reader
+exe.run(fluid.default_startup_program())
+for pass_id in range(100):
+    for data in train_reader():
+        exe.run(t.get_trainer_program())
+```
+### 分布式训练脚本运行说明
+分布式任务的运行需要将表格中说明的多个参数进行赋值:
+
+| 参数名 | 值类型 | 说明 | 示例 |
+|:-------------|:------|:---------------------------------------|:-------------|
+| trainer_id | int | 当前训练节点的ID，训练节点ID编号为0 - n-1， n为trainers的值 | 0/1/2/3 |
+| pservers | str | parameter server 列表 | 127.0.0.1:6710,127.0.0.1:6711 |
+| trainers | int | 训练节点的总个数，>0的数字 | 4 |
+| server_endpoint | str | 当前所起的服务节点的IP:PORT | 127.0.0.1:8789 |
+| training_role | str | 节点角色， TRAINER/PSERVER | PSERVER |
+
+**注意：** ```training_role```是用来区分当前所起服务的角色的，用于训练程序中，用户可根据需要自行定义，其他参数为fluid.DistributeTranspiler的transpile函数所需要，需要在调用函数前进行定义，样例如下： 
+
+```python
+t = fluid.DistributeTranspiler()
+t.transpile(
+    optimize_ops,
+    params_grads,
+    trainer_id,
+    pservers=pserver,
+    trainers=trainers)
+if training_role == "PSERVER":
+    pserver_prog = t.get_pserver_program(server_endpoint)
+    pserver_startup = t.get_startup_program(server_endpoint, pserver_prog)
+```
+
+### Demo
+完整的demo代码位于Fluid的test目录下的[book](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py)中。
+
+第一步，进入demo代码所在目录：
+```bash
+cd /paddle/python/paddle/fluid/tests/book
+```
+
+第二步，启动Parameter Server：
+```bash
+PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py
+```
+执行命令后请等待出现提示： ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。
+
+第三步，启动Trainer：
+```bash
+PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py
+```
+由于我们定义的Trainer的数量是2个，因此需要在另外一个计算节点上再启动一个Trainer。
+
+现在我们就启动了一个包含一个Parameter Server和两个Trainer的分布式训练任务。
diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a92abad0c56a4fd821f9a6b9f4f5909504c8aaf1
--- /dev/null
+++ b/doc/fluid/howto/index_cn.rst
@@ -0,0 +1,2 @@
+进阶使用
+------------
diff --git a/doc/fluid/howto/index_en.rst b/doc/fluid/howto/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..06036bdce554a96443ea1fa47c15f7670ea6089d
--- /dev/null
+++ b/doc/fluid/howto/index_en.rst
@@ -0,0 +1,4 @@
+HOW TO
+------------
+
+This is how to page
diff --git a/doc/fluid/index_cn.rst b/doc/fluid/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..be3bed4393a7346d4f2a53e2c7409ee7165fb5b6
--- /dev/null
+++ b/doc/fluid/index_cn.rst
@@ -0,0 +1,12 @@
+ PaddlePaddle Fluid
+==========================
+
+..  toctree::
+  :maxdepth: 1
+
+  getstarted/index_cn.rst
+  design/index_cn.rst
+  build_and_install/index_cn.rst
+  howto/index_cn.rst
+  dev/index_cn.rst
+  faq/index_cn.rst
diff --git a/doc/fluid/index_en.rst b/doc/fluid/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..87c831420a57b4b9ce77ecf44f7f4d0feec833a6
--- /dev/null
+++ b/doc/fluid/index_en.rst
@@ -0,0 +1,12 @@
+ PaddlePaddle Fluid
+==========================
+
+..  toctree::
+  :maxdepth: 1
+
+  getstarted/index_en.rst
+  design/index_en.rst
+  build_and_install/index_en.rst
+  howto/index_en.rst
+  dev/index_en.rst
+  faq/index_en.rst
diff --git a/doc/v2/api/fluid/optimizer.rst b/doc/v2/api/fluid/optimizer.rst
index 9b165f870459b4f9ef2efe24f5604a3fcb96f7f3..2f820595c35c2bccd6a5c8a20c60d796c04c8e97 100644
--- a/doc/v2/api/fluid/optimizer.rst
+++ b/doc/v2/api/fluid/optimizer.rst
@@ -47,3 +47,10 @@ DecayedAdagrad
     :members:
     :noindex:
 
+Adadelta
+--------------
+
+..  autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer
+    :members:
+    :noindex:
+
diff --git a/doc/v2/dev/write_docs_cn.rst b/doc/v2/dev/write_docs_cn.rst
index a055bb04c0c093c9159290067e5ccbd2525cd519..23615f8830e99633676c83ec5d28139a732c623c 100644
--- a/doc/v2/dev/write_docs_cn.rst
+++ b/doc/v2/dev/write_docs_cn.rst
@@ -2,13 +2,14 @@
 如何贡献文档
 #############
 
-PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成，也可以利用paddlepaddle.org工具来编译和预览文档。
+PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成的，PaddlePaddle.org工具可以帮助我们实现这一编译过程，并提供更好的预览效果。
 
 如何构建文档
 ============
 
 PaddlePaddle的文档构建有两种方式，分别为使用paddlepaddle.org工具和不使用paddlepaddle.org工具，两种方式都有各自的优点，前者方便预览，后者方便开发者进行调试。这两种方式中又分别有使用docker和不使用docker的两种构建方法。
 
+我们建议使用PaddlePaddle.org工具来构建文档。
 
 使用PaddlePaddle.org工具
 ------------------------
@@ -31,7 +32,7 @@ PaddlePaddle.org工具可以配合Docker使用，需要在系统里先安装好D
     docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
 
 注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令
-之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档
+之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档
 编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
 
 如果不想使用Docker，你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
@@ -56,7 +57,7 @@ PaddlePaddle.org工具可以配合Docker使用，需要在系统里先安装好D
     python manage.py runserver
 
 工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。
-之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。
+之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。
 编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
 
 想了解更多PaddlePaddle.org工具的详细信息，可以 `点击这里 <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ 。
@@ -96,7 +97,7 @@ PaddlePaddle.org工具可以配合Docker使用，需要在系统里先安装好D
 
    python -m SimpleHTTPServer 8088
 
-在浏览器中输入http://localhost:8088就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意，示例中由于使用了sphinx的原始主题，所以页面的风格与官网并不一致，但这并不影响开发者进行调试。
+在浏览器中输入 http://localhost:8088 就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意，示例中由于使用了sphinx的原始主题，所以页面的风格与官网并不一致，但这并不影响开发者进行调试。
 
 ..  image:: src/doc_en.png
     :align: center
diff --git a/doc/v2/dev/write_docs_en.rst b/doc/v2/dev/write_docs_en.rst
index f3408a84269aaeef19986c220454555fbbe30e23..15ff0d34ad622f100fe98d8738b830e47c35b41b 100644
--- a/doc/v2/dev/write_docs_en.rst
+++ b/doc/v2/dev/write_docs_en.rst
@@ -2,21 +2,20 @@
 Contribute Documentation
 ########################
 
-PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
-Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
-When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content
+PaddlePaddle's documentation includes both Chinese and English versions. The documentation is built using the ``cmake`` command to drive the ``sphinx`` compiler. The PaddlePaddle.org tool helps us to implement this compilation process and provides better preview results.
 
-How to Build Documentations
-============
+How to build Documentation
+===========================
 
-We recommend using PaddlePaddle.org tool to build documentation
+PaddlePaddle's documentation is built in two ways: using the PaddlePaddle.org tool and without using it. Both methods have their own advantages. The former facilitates previewing, while the latter facilitates debugging by the developer. We could choose to build the documentation with Docker or without it in each of the above ways.
 
+We recommend using PaddlePaddle.org tool to build documentation.
 
-Use PaddlePaddle.org tool
---------------
-This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser.
+Using PaddlePaddle.org tool
+-----------------------------
+This is the recommended method to build documentation, because it can automatically compile the documentation and preview the documentation directly in a web page. Note that, although you can preview the documentation in other ways, its style may not be consistent with the official website. Compiling with the PaddlePaddle.org tool produces a preview that will be consistent with the official website documentation style.
 
-The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool
+The PaddlePaddle.org tool can be used with Docker and Docker needs to be installed first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After installing Docker, you may use the following commands to activate the tool
 
 ..  code-block:: bash
 
@@ -32,8 +31,8 @@ The tool uses Docker, please install it on your system. Please check Docker offi
     # Please specify the working directory through -v
     docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
 
-Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command
-Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
+Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run commands
+Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation.
 The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
 
 
@@ -58,19 +57,62 @@ If you don't wish to use Docker, you can also activate the tool through Django.
     pip install -r requirements.txt
     python manage.py runserver
 
-Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
+Specify the PaddlePaddle working directory for the environment variable CONTENT_DIR so that the tool could find where the working directory is.
+
+Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation
 The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
 
-If you want to learn more on the PaddlePaddle.org, please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ 。
+Please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ for more information about the PaddlePaddle.org tool.
+
+
+Manually Building the Documentation
+-------------------------------------
+
+Build PaddlePaddle's documentation with Docker，you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After Docker is installed, you could use the scripts in the source directory to build the documentation.
+
+[TBD]
+
+If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation.
+
+.. code-block:: bash
+
+   mkdir paddle
+   cd paddle
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   mkdir -p build
+   cd build
+   cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
+
+   # If you only need to build documents, use the following commands
+   make -j $processors gen_proto_py
+   make -j $processors paddle_docs paddle_docs_cn
+
+   # If you only need to build APIs, use the following commands
+   make -j $processors gen_proto_py framework_py_proto
+   make -j $processors copy_paddle_pybind
+   make -j $processors paddle_api_docs
+
+$processors indicates that as many processes as the CPU cores are started to compile in parallel. It should be set according to the number of CPU cores of your machine.
+
+After the compilation is complete, enter the ``doc/v2`` directory. If you chose to build documents, it will generate ``cn/html/`` and ``en/html`` subdirectories under this directory. If you chose to build APIs，it will generate``api/en/html`` subdirectory. Please enter these directories respectively and execute the following commands:
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+Use a web browser and navigate to http://localhost:8000, you could see the compiled Chinese/English documents page and the English APIs page. The following figure is an example of the built English documents home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
 
-How to write Documentations
-============
+..  image:: src/doc_en.png
+    :align: center
+    :scale: 60 %
 
-PaddlePaddle uses `sphinx`_ to compile documentations，Please check sphinx official website for more detail.
+How to write Documentation
+===========================
 
+PaddlePaddle uses `sphinx`_ to compile documentation，Please check sphinx official website for more detail.
 
 How to update www.paddlepaddle.org
-============================
+===================================
 
 Please create PRs and submit them to github, please check `Contribute Code <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ 。
 PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and
diff --git a/doc/v2/faq/index_en.rst b/doc/v2/faq/index_en.rst
index 57df868f760038b25fae30df7ab20a68875ad36a..3fa220792b252617848a1c76bc2be49928e35f64 100644
--- a/doc/v2/faq/index_en.rst
+++ b/doc/v2/faq/index_en.rst
@@ -1,7 +1,8 @@
 FAQ
 ====
 
- 
+This document provides answers to some of the frequently asked questions about PaddlePaddle. If you have a question that is not covered here, please go to `PaddlePaddle Community <https://github.com/PaddlePaddle/Paddle/issues>`_ , to find an answer or submit new `issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_  , we will reply in time.
+
 ..  toctree::
   :maxdepth: 1
 
diff --git a/doc/v2/getstarted/index_en.rst b/doc/v2/getstarted/index_en.rst
index 33f299be5680e0aa4a3f36638f51135503193d94..94b306895c9ddf6140cf600131930a6675a583eb 100644
--- a/doc/v2/getstarted/index_en.rst
+++ b/doc/v2/getstarted/index_en.rst
@@ -1,8 +1,19 @@
 GET STARTED
 ============
 
+If you want to quickly know how to use PaddlePaddle, please refer to the following guide:
+
 ..  toctree::
   :maxdepth: 1
 
   quickstart_en.rst
+  
+  
+While using PaddlePaddle to build applications, please understand some basic concepts.
+
+Here is an example of linear regression. It introduces workflow of PaddlePaddle, including data format, model configuration and training, etc.
+  
+..  toctree::
+  :maxdepth: 1
+  
   concepts/use_concepts_en.rst
diff --git a/doc/v2/howto/capi/index_en.rst b/doc/v2/howto/capi/index_en.rst
index 2cbbe362fd8e06abe9866d998f60fbb3458a80b5..4ec39c9d5223442cf6872edaf7befeb5053b538e 100644
--- a/doc/v2/howto/capi/index_en.rst
+++ b/doc/v2/howto/capi/index_en.rst
@@ -1,6 +1,23 @@
-C-API Prediction Library
+C-API Inference Library
 ========================
 
+After we train a neural network, we use it to do inference. Inference is the process of preparing input data and propagating it through the model to produce the result.
+
+Compared with model training, prediction has the following features:
+
+#. Inference does not require backpropagation and parameter updates, as required during training.
+#. Labels are not needed in prediction.
+#. Most of the time, predictions need to be integrated with the user system.
+
+Therefore, the model prediction SDK needs to be designed separately and has the following features:
+
+#. The predictive SDK does not include backpropagation and parameter updates to reduce the size of the SDK.
+#. The predictive SDK needs a simple user interface for ease of use.
+#. Since the input data may have a variety of structures, the format of the input data is clearly and compactly packaged.
+#. In order to be compatible with user's system, the SDK's interface must conform to the C-standard interface.
+
+PaddlePaddle provides C-API to solve the above problem. Following are the guidelines to use the C-API:
+
 ..  toctree::
   :maxdepth: 1
 
diff --git a/doc/v2/howto/cluster/index_en.rst b/doc/v2/howto/cluster/index_en.rst
index c965d30d54e71339cf10d4b05f25e740c81adbf9..31eda57c4fb3947d92df45ea8dbb9274c9814140 100644
--- a/doc/v2/howto/cluster/index_en.rst
+++ b/doc/v2/howto/cluster/index_en.rst
@@ -2,6 +2,9 @@ Distributed Training
 ====================
 
 The effectiveness of the deep learning model is often directly related to the scale of the data: it can generally achieve better results after increasing the size of the dataset on the same model. However, it can not fit in one single computer when the amount of data increases to a certain extent. At this point, using multiple computers for distributed training is a natural solution. In distributed training, the training data is divided into multiple copies (sharding), and multiple machines participating in the training read their own data for training and collaboratively update the parameters of the overall model.
+
+Distributed training generally has framwork as shown below:
+
 .. image:: src/ps_en.png
    :width: 500
 
diff --git a/doc/v2/howto/index_en.rst b/doc/v2/howto/index_en.rst
index 2079be766f2d8e6d63ca11dccd98f80613309ceb..35ef197f58f1f865e2cdbdebb567d5637284637a 100644
--- a/doc/v2/howto/index_en.rst
+++ b/doc/v2/howto/index_en.rst
@@ -1,11 +1,37 @@
 HOW TO
-=======
+========
+
+PaddlePaddle provides the users the ability to flexibly set various command line parameters to control the model training and inference process. Please refer to the following instructions on using PaddlePaddle:
 
 ..  toctree::
   :maxdepth: 1
 
   cmd_parameter/index_en.rst
+
+PaddlePaddle supports distributed training tasks on fabric clusters, MPI clusters, and Kubernetes clusters. For detailed configuration and usage instructions, refer to:
+
+..  toctree::
+  :maxdepth: 1
+
   cluster/index_en.rst
+
+PaddlePaddle provides a C-API for inference. We provide the following guidelines  for using the C-API:
+
+..  toctree::
+  :maxdepth: 1
+
   capi/index_en.rst
+
+PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For details, please refer to：
+
+..  toctree::
+  :maxdepth: 1
+
   rnn/index_en.rst
+
+How to use the built-in timing tool, nvprof, or nvvp to run performance analysis and tuning, please refer to：
+
+..  toctree::
+  :maxdepth: 1
+
   optimization/gpu_profiling_en.rst
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
index ebb083c5a477d5be91ef14be74dd9de349d07931..e06e9a2b363d1ffc6876b98bcb7304b0a54dbcaa 100644
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -36,7 +36,7 @@ target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
 add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
   ${CAPI_PRIVATE_HEADER})
 
-add_dependencies(paddle_capi paddle_proto)
+add_dependencies(paddle_capi paddle_proto paddle_gserver)
 
 # TODO: paddle_capi_whole will be removed.
 set(PADDLE_CAPI_LAYERS_LIBS
diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc
index edfb41c72489113d9803c2957baed1ce44f8296d..73be5cdbe2a1f5994ecee4c415e83962f50532fe 100644
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
@@ -871,3 +871,67 @@ TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
   ch->Reset<int>(0);
   ChannelHolderDestroyUnblockSenders(ch, false);
 }
+
+// This tests that closing a channelholder many times.
+void ChannelHolderManyTimesClose(ChannelHolder *ch) {
+  const int num_threads = 15;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to send data to channel.
+  for (size_t i = 0; i < num_threads / 3; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended) {
+          int data = 10;
+          ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i]);
+  }
+
+  // Launches threads that try to receive data to channel.
+  for (size_t i = num_threads / 3; i < 2 * num_threads / 3; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          if (ch->Receive(&data)) {
+            EXPECT_EQ(data, 10);
+          }
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+
+  // Launches threads that try to close the channel.
+  for (size_t i = 2 * num_threads / 3; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          if (!ch->IsClosed()) {
+            ch->close();
+          }
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  // Verify that all threads are unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+  EXPECT_TRUE(ch->IsClosed());
+  // delete the channel
+  delete ch;
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) {
+  // Check for Buffered Channel
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(10);
+  ChannelHolderManyTimesClose(ch);
+}
diff --git a/paddle/fluid/framework/concurrency_test.cc b/paddle/fluid/framework/concurrency_test.cc
index 25152054eb8452a9667bd65b4441665476c1d46d..e98e9d94bf71fe9ac226ab3ad7f587b37a5c6e33 100644
--- a/paddle/fluid/framework/concurrency_test.cc
+++ b/paddle/fluid/framework/concurrency_test.cc
@@ -150,8 +150,9 @@ void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
   // Select block
   AddOp("select", {{"X", {dataChanName, quitChanName}},
                    {"case_to_execute", {"caseToExecute"}}},
-        {}, {{"sub_block", casesBlock},
-             {"cases", std::vector<std::string>{case0Config, case1Config}}},
+        {{"Out", {}}},
+        {{"sub_block", casesBlock},
+         {"cases", std::vector<std::string>{case0Config, case1Config}}},
         whileBlock);
 
   scope->Var("stepScopes");
@@ -209,9 +210,8 @@ TEST(Concurrency, Go_Op) {
 
   executor.Run(program, &scope, 0, true, true);
 
-  // After we call executor.run, the Go operator should do a channel_send to set
-  // the
-  // "result" variable to 99
+  // After we call executor.run, the Go operator should do a channel_send to
+  // set the "result" variable to 99.
   auto *finalData = tensor.data<int>();
   EXPECT_EQ(finalData[0], 99);
 }
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index a3629d70d74df6ccbf13f1598418ef7202d1e7ec..64c06687b6b905186d4efcc8441d3abef6323d53 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -14,12 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/executor.h"
 
-#include <set>
-
-#include "gflags/gflags.h"
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -40,14 +36,13 @@ namespace {
 int kProgramId = -1;
 }  // namespace
 
-struct ExecutorPrepareContext {
-  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id)
-      : prog_(prog), block_id_(block_id) {}
+ExecutorPrepareContext::ExecutorPrepareContext(
+    const framework::ProgramDesc& prog, size_t block_id)
+    : prog_(prog), block_id_(block_id) {}
 
-  const framework::ProgramDesc& prog_;
-  size_t block_id_;
-  std::vector<std::unique_ptr<OperatorBase>> ops_;
-};
+ExecutorPrepareContext::~ExecutorPrepareContext() {
+  VLOG(5) << "destroy ExecutorPrepareContext";
+}
 
 Executor::Executor(const platform::Place& place) : place_(place) {}
 
@@ -101,9 +96,8 @@ static void CheckTensorNANOrInf(const std::string& name,
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                    bool create_local_scope, bool create_vars) {
   platform::RecordBlock b(block_id);
-  auto* ctx = Prepare(pdesc, block_id);
-  RunPreparedContext(ctx, scope, create_local_scope, create_vars);
-  delete ctx;
+  auto ctx = Prepare(pdesc, block_id);
+  RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
 }
 
 // Check whether the block already has feed operators and feed_holder.
@@ -113,10 +107,11 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
 // and feed_holder_name. Raise exception when any mismatch is found.
 // Return true if the block has feed operators and holder of matching info.
 static bool has_feed_operators(
-    BlockDesc* block, std::map<std::string, const LoDTensor*>& feed_targets,
+    const BlockDesc& block,
+    std::map<std::string, const LoDTensor*>& feed_targets,
     const std::string& feed_holder_name) {
   size_t feed_count = 0;
-  for (auto* op : block->AllOps()) {
+  for (auto* op : block.AllOps()) {
     if (op->Type() == kFeedOpType) {
       feed_count++;
       PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name,
@@ -135,7 +130,7 @@ static bool has_feed_operators(
         "The number of feed operators should match 'feed_targets'");
 
     // When feed operator are present, so should be feed_holder
-    auto var = block->FindVar(feed_holder_name);
+    auto var = block.FindVar(feed_holder_name);
     PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
                             feed_holder_name);
     PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH,
@@ -153,10 +148,10 @@ static bool has_feed_operators(
 // and fetch_holder_name. Raise exception when any mismatch is found.
 // Return true if the block has fetch operators and holder of matching info.
 static bool has_fetch_operators(
-    BlockDesc* block, std::map<std::string, LoDTensor*>& fetch_targets,
+    const BlockDesc& block, std::map<std::string, LoDTensor*>& fetch_targets,
     const std::string& fetch_holder_name) {
   size_t fetch_count = 0;
-  for (auto* op : block->AllOps()) {
+  for (auto* op : block.AllOps()) {
     if (op->Type() == kFetchOpType) {
       fetch_count++;
       PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name,
@@ -175,7 +170,7 @@ static bool has_fetch_operators(
         "The number of fetch operators should match 'fetch_targets'");
 
     // When fetch operator are present, so should be fetch_holder
-    auto var = block->FindVar(fetch_holder_name);
+    auto var = block.FindVar(fetch_holder_name);
     PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
                             fetch_holder_name);
     PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST,
@@ -190,12 +185,21 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
                    std::map<std::string, const LoDTensor*>& feed_targets,
                    std::map<std::string, LoDTensor*>& fetch_targets,
                    const std::string& feed_holder_name,
-                   const std::string& fetch_holder_name) {
+                   const std::string& fetch_holder_name, bool create_vars) {
   platform::RecordBlock b(kProgramId);
-  auto* copy_program = new ProgramDesc(program);
+  bool has_feed_ops =
+      has_feed_operators(program.Block(0), feed_targets, feed_holder_name);
+  bool has_fetch_ops =
+      has_fetch_operators(program.Block(0), fetch_targets, fetch_holder_name);
+
+  ProgramDesc* copy_program = const_cast<ProgramDesc*>(&program);
+  if (!has_feed_ops || !has_fetch_ops) {
+    copy_program = std::unique_ptr<ProgramDesc>(new ProgramDesc(program)).get();
+  }
+
   auto* global_block = copy_program->MutableBlock(0);
 
-  if (!has_feed_operators(global_block, feed_targets, feed_holder_name)) {
+  if (!has_feed_ops) {
     // create feed_holder variable
     auto* feed_holder = global_block->Var(feed_holder_name);
     feed_holder->SetType(proto::VarType::FEED_MINIBATCH);
@@ -228,7 +232,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     }
   }
 
-  if (!has_fetch_operators(global_block, fetch_targets, fetch_holder_name)) {
+  if (!has_fetch_ops) {
     // create fetch_holder variable
     auto* fetch_holder = global_block->Var(fetch_holder_name);
     fetch_holder->SetType(proto::VarType::FETCH_LIST);
@@ -251,7 +255,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     }
   }
 
-  Run(*copy_program, scope, 0, true, true);
+  Run(*copy_program, scope, 0, create_vars, create_vars);
 
   // obtain the data of fetch_targets from fetch_holder
   for (auto* op : global_block->AllOps()) {
@@ -262,19 +266,17 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
           GetFetchVariable(*scope, fetch_holder_name, idx);
     }
   }
-
-  delete copy_program;
 }
 
-ExecutorPrepareContext* Executor::Prepare(const ProgramDesc& program,
-                                          int block_id) {
+std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
+    const ProgramDesc& program, int block_id) {
   auto* ctx = new ExecutorPrepareContext(program, block_id);
   PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
   auto& block = program.Block(block_id);
   for (auto& op_desc : block.AllOps()) {
     ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
   }
-  return ctx;
+  return std::unique_ptr<ExecutorPrepareContext>(ctx);
 }
 
 void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
@@ -313,22 +315,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }    // if (create_vars)
 
   for (auto& op : ctx->ops_) {
-    // TODO(ty):
-    // e.g. sgd should wait for allreduce to be finished
-    // if op's input is params' grad:
-    //     sync with allreduce stream
-    // SyncMultipleStreams(op);
-
-    VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
-    op->Run(*local_scope, place_);
     VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
-
-    // TODO(ty):
-    // e.g. allreduce shoudl wait for fc_grad to be finished.
-    // if op's output is params' grad:
-    //     sync with computation stream
-    //     apply allreduce on allreduce stream
-    // SyncMultipleStreams(op);
+    op->Run(*local_scope, place_);
 
     if (FLAGS_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index e020a6e73897543e8f382fe50071b7df4774104c..7173c51c95e04ad3095f01bb24923a7a3341c517 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -22,7 +22,17 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
-struct ExecutorPrepareContext;
+extern void InitializeVariable(Variable* var, proto::VarType::Type var_type);
+
+struct ExecutorPrepareContext {
+  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
+  ~ExecutorPrepareContext();
+
+  const framework::ProgramDesc& prog_;
+  size_t block_id_;
+  std::vector<std::unique_ptr<OperatorBase>> ops_;
+};
+
 class Executor {
  public:
   // TODO(dzhwinter) : Do not rely on this function, it will be removed
@@ -45,11 +55,11 @@ class Executor {
            std::map<std::string, const LoDTensor*>& feed_targets,
            std::map<std::string, LoDTensor*>& fetch_targets,
            const std::string& feed_holder_name = "feed",
-           const std::string& fetch_holder_name = "fetch");
+           const std::string& fetch_holder_name = "fetch",
+           bool create_vars = true);
 
- private:
-  static ExecutorPrepareContext* Prepare(const ProgramDesc& program,
-                                         int block_id);
+  static std::unique_ptr<ExecutorPrepareContext> Prepare(
+      const ProgramDesc& program, int block_id);
 
   void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                           bool create_local_scope = true,
@@ -59,7 +69,5 @@ class Executor {
   const platform::Place place_;
 };
 
-extern void InitializeVariable(Variable* var, proto::VarType::Type var_type);
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/init.cc b/paddle/fluid/framework/init.cc
index 2e0a224ff5df749fd8c809dc88a85a1643542abf..3c0d93642ac41e8d90f9a248e81cea7a4fe12293 100644
--- a/paddle/fluid/framework/init.cc
+++ b/paddle/fluid/framework/init.cc
@@ -26,6 +26,7 @@ namespace paddle {
 namespace framework {
 
 std::once_flag gflags_init_flag;
+std::once_flag p2p_init_flag;
 
 void InitGflags(std::vector<std::string> &argv) {
   std::call_once(gflags_init_flag, [&]() {
@@ -42,6 +43,27 @@ void InitGflags(std::vector<std::string> &argv) {
   });
 }
 
+void InitP2P(int count) {
+#ifdef PADDLE_WITH_CUDA
+  std::call_once(p2p_init_flag, [&]() {
+    for (int i = 0; i < count; ++i) {
+      for (int j = 0; j < count; ++j) {
+        if (i == j) continue;
+        int can_acess = -1;
+        PADDLE_ENFORCE(cudaDeviceCanAccessPeer(&can_acess, i, j),
+                       "Failed to test P2P access.");
+        if (can_acess != 1) {
+          LOG(WARNING) << "Cannot enable P2P access from " << i << " to " << j;
+        } else {
+          cudaSetDevice(i);
+          cudaDeviceEnablePeerAccess(j, 0);
+        }
+      }
+    }
+  });
+#endif
+}
+
 void InitDevices() {
   /*Init all avaiable devices by default */
 
@@ -63,7 +85,7 @@ void InitDevices() {
   for (int i = 0; i < count; ++i) {
     places.emplace_back(platform::CUDAPlace(i));
   }
-
+  InitP2P(count);
   platform::DeviceContextPool::Init(places);
 }
 
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index 6a6fa538718837a958b7d82c37f583f62f4bf96e..d99a15547b77a0e0d71b14bd1c798cd1485720b0 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -176,7 +176,7 @@ class Vector {
 
   // resize the vector
   void resize(size_t size) {
-    if (size + 1 < capacity()) {
+    if (size + 1 <= capacity()) {
       size_ = size;
     } else {
       MutableCPU();
diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu
index 4bf78499f2fda2d2631e05ddcbbd0bc49498af1a..d57f82510833d6a0cea7009cf1f0b49543812f8d 100644
--- a/paddle/fluid/framework/mixed_vector_test.cu
+++ b/paddle/fluid/framework/mixed_vector_test.cu
@@ -104,3 +104,11 @@ TEST(mixed_vector, ForEach) {
   for (auto& v : tmp) {
   }
 }
+
+TEST(mixed_vector, Reserve) {
+  paddle::framework::Vector<int> vec;
+  vec.reserve(1);
+  vec.push_back(0);
+  vec.push_back(0);
+  vec.push_back(0);
+}
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 638bd0db9d7025199c31a9327b96062512aa5adb..7a4839044008338dda43f75b5ee6def500b78270 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -117,10 +117,10 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
   if (holder_ != nullptr) {
     holder_->set_type(type);
   }
-  PADDLE_ENFORCE_GT(
-      numel(), 0,
-      "When calling this method, the Tensor's numel must be larger than zero. "
-      "Please check Tensor::Resize has been called first.");
+  PADDLE_ENFORCE_GE(numel(), 0,
+                    "When calling this method, the Tensor's numel must be "
+                    "equal or larger than zero. "
+                    "Please check Tensor::Resize has been called first.");
   int64_t size = numel() * SizeOfType(type);
   /* some versions of boost::variant don't have operator!= */
   if (holder_ == nullptr || !(holder_->place() == place) ||
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 17ccca8cdcbcaabaddbbc0ca1d3ca4fdf054b0fb..aff427310f15be72f5c8d0fa1537ffa6bbe2881d 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -13,6 +13,11 @@ cc_library(paddle_fluid_shared SHARED
     SRCS io.cc
     DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END)
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
+if(NOT APPLE)
+  # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
+  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map")
+  set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+endif()
 
 if(WITH_TESTING)
   add_subdirectory(tests/book)
diff --git a/paddle/fluid/inference/paddle_fluid.map b/paddle/fluid/inference/paddle_fluid.map
new file mode 100644
index 0000000000000000000000000000000000000000..5203784dc1fcb672eb6a26d9dfd3ffbe02e08038
--- /dev/null
+++ b/paddle/fluid/inference/paddle_fluid.map
@@ -0,0 +1,6 @@
+{
+	global:
+		*paddle*;
+	local:
+		*;
+};
diff --git a/paddle/fluid/memory/memory_test.cc b/paddle/fluid/memory/memory_test.cc
index ae98d0d52542c49620a5d598b1089c168d39ede4..eb27a52b254c1cda065197746eb179bbd1d7f2f1 100644
--- a/paddle/fluid/memory/memory_test.cc
+++ b/paddle/fluid/memory/memory_test.cc
@@ -59,7 +59,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
   EXPECT_EQ(total_size, 0UL);
 
   for (auto size :
-       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
     ps[paddle::memory::Alloc(cpu, size)] = size;
 
     // Buddy Allocator doesn't manage too large memory chunk
@@ -117,7 +117,7 @@ TEST(BuddyAllocator, GPUMultAlloc) {
   EXPECT_EQ(total_size, 0UL);
 
   for (auto size :
-       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
     ps[paddle::memory::Alloc(gpu, size)] = size;
 
     // Buddy Allocator doesn't manage too large memory chunk
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index d30124d4a3b89b802a4abaae07a33b76526f163d..9a11e1be7050adb1803b1fd835ffb811d9cae4cd 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -12,6 +12,8 @@ function(op_library TARGET)
     set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
     set(cc_srcs)
     set(cu_srcs)
+    set(hip_cu_srcs)
+    set(miopen_hip_cc_srcs)
     set(cu_cc_srcs)
     set(cudnn_cu_cc_srcs)
     set(CUDNN_FILE)
@@ -36,10 +38,19 @@ function(op_library TARGET)
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
             list(APPEND cu_srcs ${TARGET}.cu)
         endif()
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
+            list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
+        endif()
         string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
             list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
         endif()
+        if(WITH_AMD_GPU)
+            string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
+                list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
+            endif()
+        endif()
         if(WITH_MKLDNN)
             string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
             if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
@@ -48,10 +59,14 @@ function(op_library TARGET)
         endif()
     else()
         foreach(src ${op_library_SRCS})
-            if (${src} MATCHES ".*\\.cu$")
+            if (${src} MATCHES ".*\\.hip.cu$")
+                list(APPEND hip_cu_srcs ${src})
+            elseif (${src} MATCHES ".*\\.cu$")
                 list(APPEND cu_srcs ${src})
             elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
                 list(APPEND cudnn_cu_cc_srcs ${src})
+            elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
+                list(APPEND miopen_hip_cc_srcs ${src})
             elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
                 list(APPEND mkldnn_cc_srcs ${src})
             elseif(${src} MATCHES ".*\\.cu.cc$")
@@ -76,6 +91,9 @@ function(op_library TARGET)
     if (WITH_GPU)
         nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
+    elseif (WITH_AMD_GPU)
+        hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
+                ${op_common_deps})
     else()
         cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
             ${op_common_deps})
@@ -88,7 +106,7 @@ function(op_library TARGET)
         endif()
     endforeach()
 
-    # The registration of USE_OP, please refer to paddle/framework/op_registry.h.
+    # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
     # Note that it's enough to just adding one operator to pybind in a *_op.cc file.
     # And for detail pybind information, please see generated paddle/pybind/pybind.h.
     file(READ ${TARGET}.cc TARGET_CONTENT)
@@ -114,7 +132,10 @@ function(op_library TARGET)
     list(LENGTH cu_srcs cu_srcs_len)
     list(LENGTH cu_cc_srcs cu_cc_srcs_len)
     list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
-    if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
+    list(LENGTH hip_cu_srcs hip_cu_srcs_len)
+    list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
+    if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
+        ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
         file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
         set(pybind_flag 1)
     endif()
@@ -125,9 +146,19 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
     endif()
 
+    # pybind USE_OP_DEVICE_KERNEL for MIOPEN
+    if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
+    endif()
+
     # pybind USE_OP_DEVICE_KERNEL for MKLDNN
     if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
+      # Append first implemented MKLDNN activation operator
+      if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
+      else()
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
+      endif()
     endif()
 
     # pybind USE_OP
@@ -156,9 +187,13 @@ if(WITH_DISTRIBUTE)
     set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
     set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
 else()
-    set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op)
+    set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
 endif()
 
 op_library(cond_op DEPS framework_proto tensor net_op)
diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6ff363d766db7dd97e1bc193ef7b4a095a7b7c24
--- /dev/null
+++ b/paddle/fluid/operators/activation_mkldnn_op.cc
@@ -0,0 +1,193 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "mkldnn.hpp"
+#include "mkldnn_activation_op.h"
+#include "paddle/fluid/operators/activation_op.h"
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::Tensor;
+using paddle::platform::MKLDNNDeviceContext;
+
+namespace {
+template <typename T, typename ExecContext>
+void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
+                     const T alpha = 0, const T beta = 0) {
+  PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                 "It must use CPUPlace.");
+
+  auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+  const auto &mkldnn_engine = dev_ctx.GetEngine();
+
+  // get buffers
+  const auto *src = ctx.template Input<Tensor>("X");
+  const auto *src_data = src->template data<T>();
+
+  auto *dst = ctx.template Output<Tensor>("Out");
+  const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
+
+  // get memory dim
+  PADDLE_ENFORCE(src->dims().size() == 4,
+                 "Input dim must be with 4, i.e. NCHW");
+  std::vector<int> src_tz = framework::vectorize2int(src->dims());
+
+  // create memory description
+  // TODO(kbinias-intel): support more formats
+  auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                         mkldnn::memory::format::nchw);
+
+  // create memory primitives
+  auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src_data);
+  auto dst_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)dst_data);
+
+  auto forward_desc = mkldnn::eltwise_forward::desc(
+      mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
+
+  // save prim desc into global device context to be referred in backward path
+  const std::string key = ctx.op().Output("Out");
+  const std::string key_eltwise_pd = key + "@eltwise_pd";
+  auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
+      forward_desc, mkldnn_engine);
+  dev_ctx.SetBlob(key_eltwise_pd, forward_pd);
+
+  auto eltwise = mkldnn::eltwise_forward(*forward_pd, src_memory, dst_memory);
+
+  // push primitive to stream and wait until it's executed
+  std::vector<mkldnn::primitive> pipeline = {eltwise};
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+}
+
+template <typename T, typename ExecContext>
+void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
+                  const T alpha = 0, const T beta = 0) {
+  auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+  const auto &mkldnn_engine = dev_ctx.GetEngine();
+
+  // get buffers
+  const auto *x = ctx.template Input<Tensor>("X");
+  const auto *src = x->template data<T>();
+
+  auto *dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
+  const auto *diff_dst = dout->template data<T>();
+
+  auto *dx =
+      ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
+  const T *diff_src = dx->template mutable_data<T>(ctx.GetPlace());
+
+  // get memory dim
+  std::vector<int> src_tz = framework::vectorize2int(x->dims());
+
+  // create memory description
+  auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                         mkldnn::memory::format::nchw);
+
+  // create memory primitives
+  auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src);
+  auto diff_src_memory =
+      mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_src);
+  auto diff_dst_memory =
+      mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_dst);
+
+  auto backward_desc =
+      mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta);
+
+  // retrieve eltwise primitive desc from device context
+  const std::string key = ctx.op().Input("Out");
+  const std::string key_eltwise_pd = key + "@eltwise_pd";
+  const std::shared_ptr<void> forward_pd = dev_ctx.GetBlob(key_eltwise_pd);
+  PADDLE_ENFORCE(forward_pd != nullptr,
+                 "Fail to find eltwise_pd in device context");
+  auto *p_forward_pd =
+      static_cast<mkldnn::eltwise_forward::primitive_desc *>(forward_pd.get());
+
+  auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
+      backward_desc, mkldnn_engine, *p_forward_pd);
+
+  auto eltwise_bwd = mkldnn::eltwise_backward(eltwise_bwd_prim_desc, src_memory,
+                                              diff_dst_memory, diff_src_memory);
+
+  // push primitive to stream and wait until it's executed
+  std::vector<mkldnn::primitive> pipeline = {eltwise_bwd};
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+}
+}  // anonymous namespace
+
+template <typename T, mkldnn::algorithm algorithm>
+struct MKLDNNActivationFunc : public BaseActivationFunctor<T> {
+  template <typename ExecContext>
+  void operator()(const ExecContext &ctx) const {
+    eltwise_forward<T>(ctx, algorithm);
+  }
+};
+
+template <typename T, mkldnn::algorithm algorithm>
+struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
+  template <typename ExecContext>
+  void operator()(const ExecContext &ctx) const {
+    eltwise_grad<T>(ctx, algorithm);
+  }
+};
+
+template <typename T>
+using ReluMkldnnFunctor =
+    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_relu>;
+
+template <typename T>
+using TanhMkldnnFunctor =
+    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_tanh>;
+
+template <typename T>
+using SqrtMkldnnFunctor =
+    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_sqrt>;
+
+template <typename T>
+using AbsMkldnnFunctor =
+    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_abs>;
+
+template <typename T>
+using ReluMkldnnGradFunctor =
+    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_relu>;
+
+template <typename T>
+using TanhMkldnnGradFunctor =
+    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_tanh>;
+
+template <typename T>
+using SqrtMkldnnGradFunctor =
+    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_sqrt>;
+
+template <typename T>
+using AbsMkldnnGradFunctor =
+    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_abs>;
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+#define REGISTER_ACTIVATION_MKLDNN_KERNEL(act_type, functor, grad_functor) \
+  REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace,       \
+                     ops::MKLDNNActivationKernel<ops::functor<float>>);    \
+  REGISTER_OP_KERNEL(                                                      \
+      act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace,               \
+      ops::MKLDNNActivationGradKernel<ops::grad_functor<float>>);
+
+#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro)            \
+  __macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor); \
+  __macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor); \
+  __macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor); \
+  __macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor);
+
+FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL);
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index d74c47b981e51f12d99098818c71f3f6ec455d98..979115eee0dbe157dbcf2293d914cc250b35d22e 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/operators/mkldnn_activation_op.h"
 
 namespace paddle {
 namespace operators {
@@ -87,6 +88,9 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Relu operator");
     AddOutput("Out", "Output of Relu operator");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
     AddComment(R"DOC(
 Relu Activation Operator.
 
@@ -140,6 +144,9 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Tanh operator");
     AddOutput("Out", "Output of Tanh operator");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
     AddComment(R"DOC(
 Tanh Activation Operator.
 
@@ -193,6 +200,9 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Sqrt operator");
     AddOutput("Out", "Output of Sqrt operator");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
     AddComment(R"DOC(
 Sqrt Activation Operator.
 
@@ -208,6 +218,9 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Abs operator");
     AddOutput("Out", "Output of Abs operator");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
     AddComment(R"DOC(
 Abs Activation Operator.
 
@@ -524,11 +537,11 @@ REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker,
 REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
             ops::ActivationOpGrad);
 
-REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad,
-            ops::ActivationOpGrad);
+REGISTER_OP(relu, ops::ActivationWithMKLDNNOp, ops::ReluOpMaker, relu_grad,
+            ops::ActivationWithMKLDNNOpGrad);
 
-REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad,
-            ops::ActivationOpGrad);
+REGISTER_OP(tanh, ops::ActivationWithMKLDNNOp, ops::TanhOpMaker, tanh_grad,
+            ops::ActivationWithMKLDNNOpGrad);
 
 REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
             tanh_shrink_grad, ops::ActivationOpGrad);
@@ -536,11 +549,11 @@ REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
 REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker,
             softshrink_grad, ops::ActivationOpGrad);
 
-REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
-            ops::ActivationOpGrad);
+REGISTER_OP(sqrt, ops::ActivationWithMKLDNNOp, ops::SqrtOpMaker, sqrt_grad,
+            ops::ActivationWithMKLDNNOpGrad);
 
-REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad,
-            ops::ActivationOpGrad);
+REGISTER_OP(abs, ops::ActivationWithMKLDNNOp, ops::AbsOpMaker, abs_grad,
+            ops::ActivationWithMKLDNNOpGrad);
 
 REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad,
             ops::ActivationOpGrad);
@@ -613,3 +626,14 @@ REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
                                 ops::grad_functor<double>>);
 
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
+
+REGISTER_OP_CPU_KERNEL(relu,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::ReluFunctor<float>>,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::ReluFunctor<double>>);
+REGISTER_OP_CPU_KERNEL(
+    relu_grad, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                                         ops::ReluGradFunctor<float>>,
+    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                              ops::ReluGradFunctor<double>>);
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index b2633d017623c3a6a3bab2b416009d6d7c8fc1d4..7709a551dc155e1f3cd2a19a689999608f497beb 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
 
@@ -31,3 +32,16 @@ namespace ops = paddle::operators;
                                 ops::grad_functor<double>>);
 
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
+
+REGISTER_OP_CUDA_KERNEL(
+    relu, ops::ActivationKernel<paddle::platform::CUDADeviceContext,
+                                ops::ReluFunctor<float>>,
+    ops::ActivationKernel<paddle::platform::CUDADeviceContext,
+                          ops::ReluFunctor<double>>,
+    ops::ActivationKernel<paddle::platform::CUDADeviceContext,
+                          ops::ReluFunctor<paddle::platform::float16>>);
+REGISTER_OP_CUDA_KERNEL(
+    relu_grad, ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,
+                                         ops::ReluGradFunctor<float>>,
+    ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,
+                              ops::ReluGradFunctor<double>>);
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 8f791a6ca81c13a92fd8adf0d1620203bd4cf7d6..4c575b4a7b551be2d1288f7fec0a2821fc10c40d 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,6 +17,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -772,7 +776,6 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
   __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor);              \
   __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);     \
   __macro(exp, ExpFunctor, ExpGradFunctor);                          \
-  __macro(relu, ReluFunctor, ReluGradFunctor);                       \
   __macro(tanh, TanhFunctor, TanhGradFunctor);                       \
   __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor);     \
   __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                       \
diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c95077fcbdb6b6c0da31f30b795dbe4d7d4fe6fe
--- /dev/null
+++ b/paddle/fluid/operators/average_accumulates_op.cc
@@ -0,0 +1,216 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/average_accumulates_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <>
+void GetAccumulators<paddle::platform::CPUDeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t& num_updates_,
+    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
+  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
+  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
+  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
+
+  old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0];
+  num_accumulates_ = in_num_accumulates->data<int64_t>()[0];
+  num_updates_ = in_num_updates->data<int64_t>()[0];
+}
+
+template <>
+void SetAccumulators<paddle::platform::CPUDeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t num_updates_,
+    int64_t num_accumulates_, int64_t old_num_accumulates_) {
+  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
+  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
+  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
+
+  out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates_;
+  out_num_accumulates->data<int64_t>()[0] = num_accumulates_;
+  out_num_updates->data<int64_t>()[0] = num_updates_;
+}
+
+class AverageAccumulatesOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("param"),
+        "Input (param) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_sum_1"),
+        "Input (sum_1) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_sum_2"),
+        "Input (sum_2) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_sum_3"),
+        "Input (sum_3) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_num_accumulates"),
+        "Input (in_num_accumulates) of average_accumulates op should "
+        "not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("in_old_num_accumulates"),
+                   "Input (old_num_accumulates) of average_accumulates op "
+                   "should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_num_updates"),
+        "Input (num_updates) of average_accumulates op should not be null.");
+
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_sum_1"),
+        "Output (sum_1) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_sum_2"),
+        "Output (sum_2) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_sum_3"),
+        "Output (sum_3) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("out_num_accumulates"),
+                   "Output (num_accumulates) of average_accumulates op should "
+                   "not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("out_old_num_accumulates"),
+                   "Output (old_num_accumulates) of average_accumulates op "
+                   "should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_num_updates"),
+        "Output (num_updates) of average_accumulates op should not be null.");
+
+    auto in_dim = ctx->GetInputDim("param");
+
+    ctx->SetOutputDim("out_sum_1", in_dim);
+    ctx->SetOutputDim("out_sum_2", in_dim);
+    ctx->SetOutputDim("out_sum_3", in_dim);
+    ctx->SetOutputDim("out_num_accumulates", {1});
+    ctx->SetOutputDim("out_old_num_accumulates", {1});
+    ctx->SetOutputDim("out_num_updates", {1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("param")->type()),
+        ctx.GetPlace());
+  }
+};
+
+class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("param", "(Tensor), The parameter to be accumulated.");
+    AddInput("in_sum_1",
+             "(Tensor), A tensor used to store the parameter "
+             "sums with the same shape as input(param).");
+    AddInput("in_sum_2",
+             "(Tensor), A auxiliary tensor to help "
+             "accumulating sums of parameter values with the same shape as "
+             "input(param). It is used to avoid loss of precision due to too "
+             "many sums.");
+    AddInput("in_sum_3",
+             "(Tensor), A auxiliary tensor to help "
+             "accumulating sums of parameter values with the same shape as "
+             "input(param).");
+    AddInput("in_num_accumulates",
+             "(Tensor<int64_t>), The accumulating times of current window with "
+             "shape [1].");
+    AddInput(
+        "in_old_num_accumulates",
+        "(Tensor<int64_t>), The accumulating times of previous window with "
+        "shape [1].");
+    AddInput("in_num_updates",
+             "(Tensor<int64_t>), The total number of batches used by trainning "
+             "before this batch with shape [1].");
+
+    AddOutput("out_sum_1",
+              "(Tensor), A tensor used to store the "
+              "parameter sums with the same shape as input(param).");
+    AddOutput("out_sum_2",
+              "(Tensor), A auxiliary tensor to help "
+              "accumulating sums of parameter values with the same shape as "
+              "input(param). It is used to avoid loss of precision due to too "
+              "many sums.");
+    AddOutput("out_sum_3",
+              "(Tensor), A auxiliary tensor to help "
+              "accumulating sums of parameter values with the same shape as "
+              "input(param).");
+    AddOutput(
+        "out_num_accumulates",
+        "(Tensor<int64_t>), The accumulating times of current window with "
+        "shape [1].");
+    AddOutput(
+        "out_old_num_accumulates",
+        "(Tensor<int64_t>) The accumulating times of previous window with "
+        "shape [1].");
+    AddOutput(
+        "out_num_updates",
+        "(Tensor<int64_t>), The total number of batches used by trainning "
+        "before this batch with shape [1].");
+
+    AddAttr<float>("average_window",
+                   "(float, default 0) "
+                   "The rate of average window size relative to num_updates.")
+        .SetDefault(0);
+    AddAttr<int64_t>("max_average_window",
+                     "(int64_t) "
+                     "Maximum size of average window. It suggests that the "
+                     "number of mini-batches "
+                     "in one pass is appropriate value to set.");
+    AddAttr<int64_t>("min_average_window",
+                     "(int64_t, default 10000L) "
+                     "Minimu size of average window.")
+        .SetDefault(10000L);
+
+    AddComment(R"DOC(
+AverageAccumulates Operator.
+Accumulate the sum of parameter whtin sliding window. The size of sliding window is
+determined by 'average_window', 'max_average_window' and 'min_average_window'.
+Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'.
+'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'.
+
+All the accumulators were inited to zero before training.
+
+And for a mini-batch in training, accumulators were computed as below steps:
+    num_updates += 1
+    num_accumulates += 1
+    sum_1 += param
+    if num_updates % kMaxNumAccumulates == 0:
+        sum_2 += sum_1
+        sum_1 = 0
+    if num_accumulates >= min_average_window && num_accumulates >= min(max_average_window, num_updates * average_window):
+        sum_3 = sum_1 + sum_2
+        sum_1 = 0
+        sum_2 = 0
+        old_num_accumulates = num_accumulates
+        num_accumulates = 0
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(average_accumulates, ops::AverageAccumulatesOp,
+                  ops::AverageAccumulatesOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    average_accumulates,
+    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..270c46984465e5ca62eaa8da3955ce7a3eaa0c57
--- /dev/null
+++ b/paddle/fluid/operators/average_accumulates_op.cu
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/average_accumulates_op.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+template <>
+void GetAccumulators<paddle::platform::CUDADeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t& num_updates_,
+    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
+  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
+  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
+  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
+  auto stream = ctx.cuda_device_context().stream();
+  memory::Copy(platform::CPUPlace(), &old_num_accumulates_,
+               platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(),
+               sizeof(int64_t), stream);
+  memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(),
+               in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
+  memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(),
+               in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
+}
+
+template <>
+void SetAccumulators<paddle::platform::CUDADeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t num_updates_,
+    int64_t num_accumulates_, int64_t old_num_accumulates_) {
+  auto stream = ctx.cuda_device_context().stream();
+  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
+  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
+  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
+
+  memory::Copy(platform::CUDAPlace(), out_old_num_accumulates->data<int64_t>(),
+               platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
+               stream);
+  memory::Copy(platform::CUDAPlace(), out_num_accumulates->data<int64_t>(),
+               platform::CPUPlace(), &num_accumulates_, sizeof(int64_t),
+               stream);
+  memory::Copy(platform::CUDAPlace(), out_num_updates->data<int64_t>(),
+               platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream);
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    average_accumulates,
+    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f858109d1428dc67d94c253e5a39818eb2d4560d
--- /dev/null
+++ b/paddle/fluid/operators/average_accumulates_op.h
@@ -0,0 +1,113 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename DeviceContext>
+void GetAccumulators(const framework::ExecutionContext& ctx,
+                     int64_t& num_updates, int64_t& num_accumulates,
+                     int64_t& old_num_accumulates);
+
+template <typename DeviceContext>
+void SetAccumulators(const framework::ExecutionContext& ctx,
+                     int64_t num_updates, int64_t num_accumulates,
+                     int64_t old_num_accumulates);
+
+template <typename DeviceContext, typename T>
+class AverageAccumulatesKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // It is used to avoid loss of precision
+    static const int64_t kMaxNumAccumulates = 16384;
+    // Get accumulators from input
+    int64_t num_updates = 0;
+    int64_t num_accumulates = 0;
+    int64_t old_num_accumulates = 0;
+    GetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
+                                   old_num_accumulates);
+
+    // Get attrs
+    float average_window = ctx.Attr<float>("average_window");
+    int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
+    int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
+    min_average_window =
+        std::min<int64_t>(min_average_window, max_average_window);
+
+    // Get inputs
+    auto* param = ctx.Input<Tensor>("param");
+    auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
+    auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
+    auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
+    auto param_tensor = EigenVector<T>::Flatten(*param);
+    auto in_sum_1_tensor = EigenVector<T>::Flatten(*in_sum_1);
+    auto in_sum_2_tensor = EigenVector<T>::Flatten(*in_sum_2);
+    auto in_sum_3_tensor = EigenVector<T>::Flatten(*in_sum_3);
+
+    // Get outputs
+    auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
+    auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
+    auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
+    auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
+    auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
+    auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
+
+    // Compute
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    math::SetConstant<DeviceContext, T> constant_functor;
+    ++num_updates;
+    ++num_accumulates;
+    out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
+    out_sum_2_tensor.device(place) = in_sum_2_tensor;
+    out_sum_3_tensor.device(place) = in_sum_3_tensor;
+    if (num_updates % kMaxNumAccumulates == 0) {
+      // Move the sum to a different buffer to avoid loss of precision due to
+      // too many sums.
+      out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
+      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
+                       0.0);
+    }
+    if (num_accumulates >= min_average_window &&
+        num_accumulates >= std::min<int64_t>(max_average_window,
+                                             num_updates * average_window)) {
+      //  Now the average window is too long, discard the old sum.
+      out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
+      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
+                       0.0);
+      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_2,
+                       0.0);
+      old_num_accumulates = num_accumulates;
+      num_accumulates = 0;
+    }
+
+    // Set accumulators to output
+    SetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
+                                   old_num_accumulates);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 215ae229aff96d76fc948e19bdb42db319af65dc..36049ee6a4a0d2a251b6d10cf1ff05a9d9845089 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -80,6 +80,29 @@ class BatchNormOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("SavedVariance", {C});
     ctx->ShareLoD("X", "Y");
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    // For float or float16 input tensor, the type of the scale, bias, mean,
+    // and var tensors should both be float.
+    auto bn_param_type = framework::proto::VarType::FP32;
+    PADDLE_ENFORCE_EQ(bn_param_type,
+                      framework::ToDataType(ctx.Input<Tensor>("Scale")->type()),
+                      "Scale input should be of float type");
+    PADDLE_ENFORCE_EQ(bn_param_type,
+                      framework::ToDataType(ctx.Input<Tensor>("Bias")->type()),
+                      "Bias input should be of float type");
+    PADDLE_ENFORCE_EQ(bn_param_type,
+                      framework::ToDataType(ctx.Input<Tensor>("Mean")->type()),
+                      "Mean input should be of float type");
+    PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType(
+                                         ctx.Input<Tensor>("Variance")->type()),
+                      "Variance input should be of float type");
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 
 class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -434,12 +457,39 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
   }
 };
 
+class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *op = new framework::OpDesc();
+    op->SetType("batch_norm_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
+
+    op->SetInput("Scale", Input("Scale"));
+    op->SetInput("SavedMean", Output("SavedMean"));
+    op->SetInput("SavedVariance", Output("SavedVariance"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
+    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
-            batch_norm_grad, ops::BatchNormGradOp);
+REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
+                  ops::BatchNormGradMaker);
+REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
+
 REGISTER_OP_CPU_KERNEL(
     batch_norm,
     ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc
index 2d1556efc66826ea9847de8311ccecdee0ea7871..6ceacc39924a7558e380aaf563aaf234f1bf30a5 100644
--- a/paddle/fluid/operators/batch_norm_op.cu.cc
+++ b/paddle/fluid/operators/batch_norm_op.cu.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cfloat>
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
@@ -26,6 +27,8 @@ using Tensor = framework::Tensor;
 using DataLayout = framework::DataLayout;
 template <typename T>
 using CudnnDataType = platform::CudnnDataType<T>;
+template <typename T>
+using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 
 void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout,
                   int *N, int *C, int *H, int *W, int *D) {
@@ -104,8 +107,9 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
         data_desc_, CudnnDataType<T>::type,
         x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
+    // Note: PERSISTENT not implemented for inference
     CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        bn_param_desc_, data_desc_, mode_));
+        bn_param_desc_, data_desc_, is_test ? CUDNN_BATCHNORM_SPATIAL : mode_));
 
     const auto *scale = ctx.Input<Tensor>("Scale");
     const auto *bias = ctx.Input<Tensor>("Bias");
@@ -118,15 +122,16 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
 
     // alloc memory
     y->mutable_data<T>(ctx.GetPlace());
-    mean_out->mutable_data<T>(ctx.GetPlace());
-    variance_out->mutable_data<T>(ctx.GetPlace());
-    saved_mean->mutable_data<T>(ctx.GetPlace());
-    saved_variance->mutable_data<T>(ctx.GetPlace());
+    mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
 
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    math::SetConstant<platform::CUDADeviceContext, T> functor;
-    functor(dev_ctx, saved_mean, 0);
-    functor(dev_ctx, saved_variance, 0);
+    math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
+        functor;
+    functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
+    functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
 
     auto handle = dev_ctx.cudnn_handle();
 
@@ -147,8 +152,10 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
           CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
           CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
           data_desc_, y->template mutable_data<T>(ctx.GetPlace()),
-          bn_param_desc_, scale->template data<T>(), bias->template data<T>(),
-          est_mean->template data<T>(), est_var->template data<T>(), epsilon));
+          bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
+          bias->template data<BatchNormParamType<T>>(),
+          est_mean->template data<BatchNormParamType<T>>(),
+          est_var->template data<BatchNormParamType<T>>(), epsilon));
     } else {
       // Run training mode.
       // obtain running mean and running inv var, and see if we need to
@@ -159,11 +166,16 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
           handle, mode_, CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
           data_desc_, x->template data<T>(), data_desc_,
           y->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
-          scale->template data<T>(), bias->template data<T>(), this_factor,
-          mean_out->template mutable_data<T>(ctx.GetPlace()),
-          variance_out->template mutable_data<T>(ctx.GetPlace()), epsilon,
-          saved_mean->template mutable_data<T>(ctx.GetPlace()),
-          saved_variance->template mutable_data<T>(ctx.GetPlace())));
+          scale->template data<BatchNormParamType<T>>(),
+          bias->template data<BatchNormParamType<T>>(), this_factor,
+          mean_out->template mutable_data<BatchNormParamType<T>>(
+              ctx.GetPlace()),
+          variance_out->template mutable_data<BatchNormParamType<T>>(
+              ctx.GetPlace()),
+          epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
+                       ctx.GetPlace()),
+          saved_variance->template mutable_data<BatchNormParamType<T>>(
+              ctx.GetPlace())));
     }
 
     // clean when exit.
@@ -270,9 +282,9 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
-    batch_norm,
-    ops::BatchNormKernel<paddle::platform::CUDADeviceContext, float>);
+    batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
+    ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
-    batch_norm_grad,
-    ops::BatchNormGradKernel<paddle::platform::CUDADeviceContext, float>);
+    batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/box_coder_op.cc b/paddle/fluid/operators/box_coder_op.cc
index eccdd408a17a07a541480705242b137f8207c139..ec416f725e75fae57484751ee8a066c0b9da8a70 100644
--- a/paddle/fluid/operators/box_coder_op.cc
+++ b/paddle/fluid/operators/box_coder_op.cc
@@ -126,6 +126,7 @@ width and height.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
+REGISTER_OPERATOR(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
                        ops::BoxCoderKernel<double>);
diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc
index 72f8cb04f2de3af4ee526c3d9b86ff96e34f0b0a..dd0068d571f72c9c22334e523cd091fe4c8da5a6 100644
--- a/paddle/fluid/operators/cast_op.cc
+++ b/paddle/fluid/operators/cast_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/cast_op.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
@@ -88,4 +89,5 @@ REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
                        ops::CastOpKernel<CPU, double>,
                        ops::CastOpKernel<CPU, int>,
                        ops::CastOpKernel<CPU, int64_t>,
-                       ops::CastOpKernel<CPU, bool>);
+                       ops::CastOpKernel<CPU, bool>,
+                       ops::CastOpKernel<CPU, paddle::platform::float16>);
diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu
index 507e9a531aae70e60bc6748bfab800310d6e0c21..c486c5850e25fcf4370f02cb145c244743a4cc4b 100644
--- a/paddle/fluid/operators/cast_op.cu
+++ b/paddle/fluid/operators/cast_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cast_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 template <typename T>
 using CastOpKernel =
@@ -20,4 +21,5 @@ using CastOpKernel =
 
 REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>,
                         CastOpKernel<int>, CastOpKernel<int64_t>,
-                        CastOpKernel<bool>);
+                        CastOpKernel<bool>,
+                        CastOpKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index 0ddbfdb4aa9e844adbb291e1c5612e96681831d6..a32aba4c1ff2f5e775aeb41f25b02322dbc6a64a 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -28,6 +28,8 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
 using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using DataLayout = platform::DataLayout;
+template <typename T>
+using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
     static_cast<size_t>(1024) * 1024 * 1024;
@@ -134,8 +136,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv forward ---------------------
-    typename platform::CudnnDataType<T>::ScalingParamType alpha = 1.0f,
-                                                          beta = 0.0f;
+    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     for (int i = 0; i < groups; i++) {
       PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
           handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
@@ -282,8 +283,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
-    typename platform::CudnnDataType<T>::ScalingParamType alpha = 1.0f,
-                                                          beta = 0.0f;
+    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index e3fc21c90f95469d646139a4454501d1c30bd51c..650bc92be22af9ea8afcacf590a11190109e8811 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -70,16 +70,16 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 
 framework::OpKernelType ConvOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  framework::LibraryType library_{framework::LibraryType::kPlain};
+  framework::LibraryType library{framework::LibraryType::kPlain};
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kCUDNN;
+    library = framework::LibraryType::kCUDNN;
   }
 #endif
 #ifdef PADDLE_WITH_MKLDNN
-  if (library_ == framework::LibraryType::kPlain &&
+  if (library == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kMKLDNN;
+    library = framework::LibraryType::kMKLDNN;
   }
 #endif
 
@@ -91,15 +91,15 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
                     "input and filter data type should be consistent");
 
   if (input_data_type == framework::proto::VarType::FP16) {
-    PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
+    PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
                       "float16 can only be used when CUDNN is used");
   }
 
   std::string data_format = ctx.Attr<std::string>("data_format");
   // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
-                                 library_);
+  framework::DataLayout layout = framework::StringToDataLayout(data_format);
+  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
+                                 library);
 }
 
 Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h
index ec315695a68befc2e3de798fdb3fa146a903aaff..6da3a24dc89a85fe432b6350d3af7b0e84337c9d 100644
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@@ -78,7 +78,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
       for (int64_t i = 0; i < batch_size; ++i) {
         PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
         int64_t index = i * class_num + label_data[i];
-        dx_data[index] = -dy_data[i] / x_data[index];
+        dx_data[index] = math::TolerableValue<T>()(-dy_data[i] / x_data[index]);
       }
     }
   }
diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt
index 94395ccfbcbd74ee40552a5c70dc8b8063a5f851..2b19f0448955d2d7582f23ac133c14ffdf5c9e49 100644
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
@@ -1,6 +1,8 @@
 if(WITH_DISTRIBUTE)
-  grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
+  grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
+      grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  cc_test(serde_test SRCS test_serde.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc)
+  cc_test(serde_test SRCS test_serde.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
+      cares zlib protobuf sendrecvop_grpc)
 endif()
diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.h b/paddle/fluid/operators/detail/bytebuffer_stream.h
index 099deb12d0e436427c147ab9b1eb553b712e14fb..1791a48aab1b66147f645c90757b35ef5f6e001b 100644
--- a/paddle/fluid/operators/detail/bytebuffer_stream.h
+++ b/paddle/fluid/operators/detail/bytebuffer_stream.h
@@ -23,9 +23,107 @@ limitations under the License. */
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 
+namespace grpc {
+// A ZeroCopyInputStream that reads from grpc_byte_buffer
+class GrpcBufferReader final
+    : public ::google::protobuf::io::ZeroCopyInputStream {
+  typedef void (CoreCodegenInterface::*OldReaderInitAPI)(
+      grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer);
+  typedef int (CoreCodegenInterface::*NewReaderInitAPI)(
+      grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer);
+  void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader,
+                  grpc_byte_buffer* buffer) {
+    (g_core_codegen_interface->*ptr)(reader, buffer);
+  }
+  void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader,
+                  grpc_byte_buffer* buffer) {
+    int result = (g_core_codegen_interface->*ptr)(reader, buffer);
+    (void)result;
+  }
+
+ public:
+  explicit GrpcBufferReader(grpc_byte_buffer* buffer)
+      : byte_count_(0), backup_count_(0) {
+    ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_,
+               buffer);
+  }
+  ~GrpcBufferReader() override {
+    g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_);
+  }
+
+  bool Next(const void** data, int* size) override {
+    if (backup_count_ > 0) {
+      *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) -
+              backup_count_;
+      GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX);
+      *size = (int)backup_count_;
+      backup_count_ = 0;
+      return true;
+    }
+    if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_,
+                                                                &slice_)) {
+      return false;
+    }
+    g_core_codegen_interface->grpc_slice_unref(slice_);
+    *data = GRPC_SLICE_START_PTR(slice_);
+    // On win x64, int is only 32bit
+    GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
+    byte_count_ += * size = (int)GRPC_SLICE_LENGTH(slice_);
+    return true;
+  }
+
+  void BackUp(int count) override { backup_count_ = count; }
+
+  bool Skip(int count) override {
+    const void* data;
+    int size;
+    while (Next(&data, &size)) {
+      if (size >= count) {
+        BackUp(size - count);
+        return true;
+      }
+      // size < count;
+      count -= size;
+    }
+    // error or we have too large count;
+    return false;
+  }
+
+  ::google::protobuf::int64 ByteCount() const override {
+    return byte_count_ - backup_count_;
+  }
+
+ private:
+  int64_t byte_count_;
+  int64_t backup_count_;
+  grpc_byte_buffer_reader reader_;
+  grpc_slice slice_;
+};
+
+};  // namespace grpc
+
 namespace paddle {
 namespace operators {
 namespace detail {
+// Source provides a way for a particular RPC implementation to provide
+// received data to ParseFrom.
+class Source {
+ public:
+  virtual ~Source() {}
+
+  // Return the stream that contains the data to be parsed.
+  // Note that this method might be invoked more than once if
+  // ParseFrom needs to fall back to a more expensive parsing method.
+  // Every call must return a stream pointing at the beginning of
+  // the serialized RecvTensorResponse.
+  //
+  // Note that a subsequent call to contents() invalidates previous
+  // results of contents().
+  //
+  // Ownership of the returned stream is retained by the Source and
+  // should not be deleted by the caller.
+  virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0;
+};
 
 // A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
 class GrpcByteBufferSource
@@ -46,6 +144,43 @@ class GrpcByteBufferSource
   ::google::protobuf::int64 byte_count_;
 };
 
+class GrpcByteBufferSourceWrapper : public Source {
+ public:
+  explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source)
+      : source_(source) {}
+  ::google::protobuf::io::ZeroCopyInputStream* contents() override {
+    return source_;
+  }
+
+ private:
+  GrpcByteBufferSource* source_;
+};
+
+class GrpcByteSource : public Source {
+ public:
+  explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {}
+  ~GrpcByteSource() override { DeleteStream(); }
+
+  typedef ::grpc::GrpcBufferReader Reader;
+
+  ::google::protobuf::io::ZeroCopyInputStream* contents() override {
+    DeleteStream();
+    stream_ = new (&space_) Reader(buffer_);
+    return stream_;
+  }
+
+ private:
+  void DeleteStream() {
+    if (stream_) {
+      stream_->~Reader();
+    }
+  }
+
+  grpc_byte_buffer* buffer_;  // Not owned
+  Reader* stream_ = nullptr;  // Points into space_ if non-nullptr
+  char space_[sizeof(Reader)];
+};
+
 }  // namespace detail
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index ddeeebec58e02f1686fd2e3d3e5ac1a4c4fd3c59..e73bbe7537a9b37d358a5aa4a076032b57fca513 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "grpc_client.h"
+#include <sys/time.h>
 #include "paddle/fluid/framework/threadpool.h"
+
 namespace paddle {
 namespace operators {
 namespace detail {
@@ -31,8 +33,9 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
 
   framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] {
     auto* var = p_scope->FindVar(var_name_val);
-    sendrecv::VariableMessage req;
-    SerializeToMessage(var_name_val, var, *p_ctx, &req);
+
+    ::grpc::ByteBuffer req;
+    SerializeToByteBuffer(var_name_val, var, *p_ctx, &req);
 
     // varhandle
     VarHandle var_h;
@@ -46,8 +49,10 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
     s->Prepare(var_h, time_out);
     s->response_call_back_ = NULL;
 
-    auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
-    rpc->Finish(&s->reply_, &s->status_, (void*)s);
+    auto call = s->stub_g_.PrepareUnaryCall(
+        s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
+    call->StartCall();
+    call->Finish(&s->reply_, &s->status_, (void*)s);
   });
 
   req_count_++;
@@ -56,9 +61,19 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
 }
 
 void ProcGetResponse(const VarHandle& var_h,
-                     const sendrecv::VariableMessage& ret_msg) {
-  auto* outvar = var_h.scope->FindVar(var_h.name);
-  DeserializeFromMessage(ret_msg, *var_h.ctx, outvar);
+                     // const sendrecv::VariableMessage& ret_msg) {
+                     const ::grpc::ByteBuffer& ret_msg) {
+  framework::Variable* outvar = NULL;
+  DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, outvar);
+}
+
+template <typename T>
+void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
+  ::grpc::Slice slice(proto.ByteSizeLong());
+  proto.SerializeWithCachedSizesToArray(
+      const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(slice.begin())));
+  ::grpc::ByteBuffer tmp(&slice, 1);
+  result->Swap(&tmp);
 }
 
 bool RPCClient::AsyncGetVariable(const std::string& ep,
@@ -88,8 +103,13 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
     s->Prepare(var_h, time_out);
     s->response_call_back_ = ProcGetResponse;
 
-    auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
-    rpc->Finish(&s->reply_, &s->status_, (void*)s);
+    ::grpc::ByteBuffer buf;
+    RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
+
+    auto call = s->stub_g_.PrepareUnaryCall(
+        s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
+    call->StartCall();
+    call->Finish(&s->reply_, &s->status_, (void*)s);
   });
 
   req_count_++;
diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h
index f520367dd981288416631fdad15241fb5d811d07..8216ac52fbbb3dcd2f30957cde58a850a77b08d6 100644
--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
@@ -25,6 +25,11 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
+#include <grpc++/generic/generic_stub.h>
+#include <grpc++/grpc++.h>
+#include <grpc++/support/byte_buffer.h>
+#include <grpc++/support/slice.h>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
@@ -49,15 +54,11 @@ struct VarHandle {
   }
 };
 
-void ProcGetResponse(const VarHandle& var_h,
-                     const sendrecv::VariableMessage& msg);
+void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
 
 class BaseProcessor {
  public:
-  explicit BaseProcessor(std::shared_ptr<grpc::Channel> ch) {
-    stub_ = sendrecv::SendRecvService::NewStub(ch);
-    context_ = NULL;
-  }
+  explicit BaseProcessor(std::shared_ptr<grpc::Channel> ch) { context_ = NULL; }
 
   virtual ~BaseProcessor() {}
 
@@ -82,19 +83,18 @@ class BaseProcessor {
 
   virtual void Process() = 0;
 
-  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
   std::unique_ptr<grpc::ClientContext> context_;
   grpc::Status status_;
   VarHandle var_h_;
 };
 
-typedef std::function<void(const VarHandle&, const sendrecv::VoidMessage&)>
+typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
     RequestSendCallBack;
 
 class SendProcessor : public BaseProcessor {
  public:
   explicit SendProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {}
+      : BaseProcessor(ch), stub_g_(ch) {}
 
   virtual ~SendProcessor() {}
 
@@ -104,17 +104,18 @@ class SendProcessor : public BaseProcessor {
     }
   }
 
-  sendrecv::VoidMessage reply_;
+  ::grpc::GenericStub stub_g_;
+  ::grpc::ByteBuffer reply_;
   RequestSendCallBack response_call_back_ = NULL;
 };
 
-typedef std::function<void(const VarHandle&, const sendrecv::VariableMessage&)>
+typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
     RequestGetCallBack;
 
 class GetProcessor : public BaseProcessor {
  public:
   explicit GetProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {}
+      : BaseProcessor(ch), stub_g_(ch) {}
 
   virtual ~GetProcessor() {}
 
@@ -124,30 +125,37 @@ class GetProcessor : public BaseProcessor {
     }
   }
 
-  sendrecv::VariableMessage reply_;
+  ::grpc::ByteBuffer reply_;
+  ::grpc::GenericStub stub_g_;
   RequestGetCallBack response_call_back_ = ProcGetResponse;
 };
 
 class BatchBarrierProcessor : public BaseProcessor {
  public:
   explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {}
+      : BaseProcessor(ch) {
+    stub_ = sendrecv::SendRecvService::NewStub(ch);
+  }
 
   virtual ~BatchBarrierProcessor() {}
 
   virtual void Process() {}
   sendrecv::VoidMessage reply_;
+  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
 
 class FetchBarrierProcessor : public BaseProcessor {
  public:
   explicit FetchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {}
+      : BaseProcessor(ch) {
+    stub_ = sendrecv::SendRecvService::NewStub(ch);
+  }
 
   virtual ~FetchBarrierProcessor() {}
 
   virtual void Process() {}
   sendrecv::VariableMessage reply_;
+  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
 
 class RPCClient {
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index 8fff430cc4890925e4edba2fadb8eb7fc647d181..9691d1e86b111def5b82e022dd01795aaf5c7b0d 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/detail/grpc_server.h"
 
-using grpc::ServerAsyncResponseWriter;
+using ::grpc::ServerAsyncResponseWriter;
 
 namespace paddle {
 namespace operators {
@@ -26,9 +26,10 @@ enum CallStatus { PROCESS = 0, FINISH };
 // https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server
 class RequestBase {
  public:
-  explicit RequestBase(sendrecv::SendRecvService::AsyncService* service,
-                       grpc::ServerCompletionQueue* cq)
-      : service_(service), cq_(cq), status_(PROCESS) {
+  explicit RequestBase(GrpcService::AsyncService* service,
+                       ::grpc::ServerCompletionQueue* cq,
+                       const platform::DeviceContext* dev_ctx)
+      : service_(service), cq_(cq), status_(PROCESS), dev_ctx_(dev_ctx) {
     PADDLE_ENFORCE(cq_);
   }
   virtual ~RequestBase() {}
@@ -42,55 +43,58 @@ class RequestBase {
   }
 
  protected:
-  grpc::ServerContext ctx_;
-  sendrecv::SendRecvService::AsyncService* service_;
-  grpc::ServerCompletionQueue* cq_;
+  ::grpc::ServerContext ctx_;
+  GrpcService::AsyncService* service_;
+  ::grpc::ServerCompletionQueue* cq_;
   CallStatus status_;
+  const platform::DeviceContext* dev_ctx_;
 };
 
-typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
-
 class RequestSend final : public RequestBase {
  public:
-  explicit RequestSend(sendrecv::SendRecvService::AsyncService* service,
-                       grpc::ServerCompletionQueue* cq,
-                       SimpleBlockQueue<MessageWithName>* queue)
-      : RequestBase(service, cq), queue_(queue), responder_(&ctx_) {
-    service_->RequestSendVariable(&ctx_, &request_, &responder_, cq_, cq_,
-                                  this);
+  explicit RequestSend(GrpcService::AsyncService* service,
+                       ::grpc::ServerCompletionQueue* cq,
+                       framework::Scope* scope, ReceivedQueue* queue,
+                       const platform::DeviceContext* dev_ctx)
+      : RequestBase(service, cq, dev_ctx), queue_(queue), responder_(&ctx_) {
+    request_.reset(new VariableResponse(scope, dev_ctx_));
+    int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable);
+    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
+                                cq_, cq_, this);
   }
 
   virtual ~RequestSend() {}
 
-  virtual std::string GetReqName() { return request_.varname(); }
+  virtual std::string GetReqName() { return request_->Varname(); }
 
   virtual void Process() {
-    MessageWithName msg_with_name =
-        std::make_pair(request_.varname(), std::move(request_));
-    queue_->Push(std::move(msg_with_name));
-    responder_.Finish(reply_, grpc::Status::OK, this);
+    queue_->Push(std::make_pair(request_->Varname(), request_));
+
+    sendrecv::VoidMessage reply;
+    responder_.Finish(reply, ::grpc::Status::OK, this);
     status_ = FINISH;
   }
 
  protected:
-  sendrecv::VariableMessage request_;
-  sendrecv::VoidMessage reply_;
-  SimpleBlockQueue<MessageWithName>* queue_;
+  std::shared_ptr<VariableResponse> request_;
+  ReceivedQueue* queue_;
   ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
 };
 
 class RequestGet final : public RequestBase {
  public:
-  explicit RequestGet(sendrecv::SendRecvService::AsyncService* service,
-                      grpc::ServerCompletionQueue* cq, framework::Scope* scope,
+  explicit RequestGet(GrpcService::AsyncService* service,
+                      ::grpc::ServerCompletionQueue* cq,
+                      framework::Scope* scope,
                       const platform::DeviceContext* dev_ctx,
                       SimpleBlockQueue<MessageWithName>* queue)
-      : RequestBase(service, cq),
+      : RequestBase(service, cq, dev_ctx),
         responder_(&ctx_),
         scope_(scope),
-        dev_ctx_(dev_ctx),
         queue_(queue) {
-    service_->RequestGetVariable(&ctx_, &request_, &responder_, cq_, cq_, this);
+    int method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
+    service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
+                                cq_, this);
   }
 
   virtual ~RequestGet() {}
@@ -101,24 +105,26 @@ class RequestGet final : public RequestBase {
     // proc request.
     std::string var_name = request_.varname();
     auto* var = scope_->FindVar(var_name);
+
+    ::grpc::ByteBuffer reply;
     if (var_name != FETCH_BARRIER_MESSAGE) {
-      SerializeToMessage(var_name, var, *dev_ctx_, &reply_);
+      SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
     }
-    // TODO(gongwb): check var's info.
-    responder_.Finish(reply_, grpc::Status::OK, this);
+
+    responder_.Finish(reply, ::grpc::Status::OK, this);
     status_ = FINISH;
-    MessageWithName msg_with_name =
-        //          request name    reply
-        std::make_pair(var_name, std::move(reply_));
-    queue_->Push(msg_with_name);
+
+    if (var_name == FETCH_BARRIER_MESSAGE) {
+      sendrecv::VariableMessage msg;
+      MessageWithName msg_with_name = std::make_pair(var_name, msg);
+      queue_->Push(msg_with_name);
+    }
   }
 
  protected:
   sendrecv::VariableMessage request_;
-  sendrecv::VariableMessage reply_;
-  ServerAsyncResponseWriter<sendrecv::VariableMessage> responder_;
+  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
   framework::Scope* scope_;
-  const platform::DeviceContext* dev_ctx_;
   SimpleBlockQueue<MessageWithName>* queue_;
 };
 
@@ -133,8 +139,8 @@ void AsyncGRPCServer::WaitClientGet(int count) {
 }
 
 void AsyncGRPCServer::RunSyncUpdate() {
-  grpc::ServerBuilder builder;
-  builder.AddListeningPort(address_, grpc::InsecureServerCredentials());
+  ::grpc::ServerBuilder builder;
+  builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials());
   builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
   builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
   builder.RegisterService(&service_);
@@ -182,8 +188,8 @@ void AsyncGRPCServer::TryToRegisterNewSendOne() {
   if (is_shut_down_) {
     return;
   }
-  RequestSend* send =
-      new RequestSend(&service_, cq_send_.get(), &var_recv_queue_);
+  RequestSend* send = new RequestSend(&service_, cq_send_.get(), scope_,
+                                      &var_recv_queue_, dev_ctx_);
   VLOG(4) << "Create RequestSend status:" << send->Status();
 }
 
@@ -198,7 +204,7 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() {
 }
 
 // FIXME(typhoonzero): change cq_name to enum.
-void AsyncGRPCServer::HandleRequest(grpc::ServerCompletionQueue* cq,
+void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
                                     std::string cq_name,
                                     std::function<void()> TryToRegisterNewOne) {
   TryToRegisterNewOne();
diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h
index b6666bcf96e484b0b17b935c0efb2930f19b19f2..10e6dd45a901d36de4a6577db4da05551645eb73 100644
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
@@ -14,28 +14,31 @@ limitations under the License. */
 
 #pragma once
 
+#include <grpc++/grpc++.h>
+#include <thread>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/detail/simple_block_queue.h"
-
+#include "paddle/fluid/operators/detail/grpc_service.h"
 #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
 #include "paddle/fluid/operators/detail/send_recv.pb.h"
-
-#include <grpc++/grpc++.h>
-#include <grpc/support/log.h>
-#include <thread>
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/operators/detail/simple_block_queue.h"
 
 namespace paddle {
 namespace operators {
 namespace detail {
 
+typedef std::pair<std::string, std::shared_ptr<VariableResponse>>
+    ReceivedMessage;
+typedef SimpleBlockQueue<ReceivedMessage> ReceivedQueue;
+
 typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
 class RequestBase;
 
-class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
+class AsyncGRPCServer final {
  public:
   explicit AsyncGRPCServer(const std::string &address) : address_(address) {}
 
@@ -50,14 +53,16 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
 
   void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; }
 
-  const MessageWithName Get() { return this->var_recv_queue_.Pop(); }
+  const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
 
-  void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); }
+  void Push(const std::string &msg_name) {
+    this->var_recv_queue_.Push(std::make_pair(msg_name, nullptr));
+  }
 
   void ShutDown();
 
  protected:
-  void HandleRequest(grpc::ServerCompletionQueue *cq, std::string cq_name,
+  void HandleRequest(::grpc::ServerCompletionQueue *cq, std::string cq_name,
                      std::function<void()> TryToRegisterNewOne);
   void TryToRegisterNewSendOne();
   void TryToRegisterNewGetOne();
@@ -66,18 +71,19 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
  private:
   std::mutex cq_mutex_;
   volatile bool is_shut_down_ = false;
-  std::unique_ptr<grpc::ServerCompletionQueue> cq_send_;
-  std::unique_ptr<grpc::ServerCompletionQueue> cq_get_;
+  std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_;
+  std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_;
 
-  sendrecv::SendRecvService::AsyncService service_;
-  std::unique_ptr<grpc::Server> server_;
+  GrpcService::AsyncService service_;
+  std::unique_ptr<::grpc::Server> server_;
 
   std::string address_;
   framework::Scope *scope_;
   const platform::DeviceContext *dev_ctx_;
+
   // received variable from RPC, operators fetch variable from this queue.
-  SimpleBlockQueue<MessageWithName> var_recv_queue_;
   SimpleBlockQueue<MessageWithName> var_get_queue_;
+  ReceivedQueue var_recv_queue_;
 
   // condition of the sub program
   std::mutex barrier_mutex_;
diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae6f9db3bd31a4b4839b34e8e53dd87f1ecf4b1d
--- /dev/null
+++ b/paddle/fluid/operators/detail/grpc_service.h
@@ -0,0 +1,118 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <grpc++/impl/codegen/async_stream.h>
+#include <grpc++/impl/codegen/async_unary_call.h>
+#include <grpc++/impl/codegen/proto_utils.h>
+#include <grpc++/impl/codegen/rpc_method.h>
+#include <grpc++/impl/codegen/service_type.h>
+#include <grpc++/impl/codegen/status.h>
+#include <grpc++/impl/codegen/stub_options.h>
+#include <grpc++/impl/codegen/sync_stream.h>
+#include <grpc++/support/byte_buffer.h>
+#include "paddle/fluid/operators/detail/variable_response.h"
+
+// NOTE: This method was originally created by tensorflow
+//       (https://github.com/tensorflow/tensorflow/) we borrow this
+//       method and did some modifications so that we can parse gRPC
+//       requests without too much copying of the tensor data.
+
+namespace grpc {
+class CompletionQueue;
+class Channel;
+class RpcService;
+class ServerCompletionQueue;
+class ServerContext;
+
+// Support parsing/unparsing of tensorflow::VariableResponse.
+// Wire-format is identical to RecvVariableResponse.
+template <>
+class SerializationTraits<paddle::operators::detail::VariableResponse> {
+ public:
+  static Status Serialize(
+      const paddle::operators::detail::VariableResponse& msg,
+      grpc_byte_buffer** bp, bool* own_buffer) {
+    PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!");
+    return Status();
+  }
+  static Status Deserialize(grpc_byte_buffer* buffer,
+                            paddle::operators::detail::VariableResponse* msg,
+                            int max_message_size = INT_MAX) {
+    if (buffer == nullptr) {
+      return Status(StatusCode::INTERNAL, "No payload");
+    }
+
+    Status result = g_core_codegen_interface->ok();
+    if (result.ok()) {
+      paddle::operators::detail::GrpcByteSource source(buffer);
+      int ret = msg->Parse(&source);
+      if (ret != 0) {
+        result = Status(StatusCode::INTERNAL, "VariableResponse parse error");
+      }
+    }
+    g_core_codegen_interface->grpc_byte_buffer_destroy(buffer);
+    return result;
+  }
+};
+}  // namespace grpc
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+enum class GrpcMethod {
+  kSendVariable,
+  kGetVariable,
+};
+
+static const int kGrpcNumMethods =
+    static_cast<int>(GrpcMethod::kGetVariable) + 1;
+
+inline const char* GrpcMethodName(GrpcMethod id) {
+  switch (id) {
+    case GrpcMethod::kSendVariable:
+      return "/sendrecv.SendRecvService/SendVariable";
+    case GrpcMethod::kGetVariable:
+      return "/sendrecv.SendRecvService/GetVariable";
+  }
+
+  // Shouldn't be reached.
+  PADDLE_ENFORCE(false, "Invalid id: not found valid method name");
+  return nullptr;
+}
+
+class GrpcService final {
+ public:
+  class AsyncService : public ::grpc::Service {
+   public:
+    AsyncService() {
+      for (int i = 0; i < kGrpcNumMethods; ++i) {
+        AddMethod(new ::grpc::internal::RpcServiceMethod(
+            GrpcMethodName(static_cast<GrpcMethod>(i)),
+            ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
+        ::grpc::Service::MarkMethodAsync(i);
+      }
+    }
+    virtual ~AsyncService() {}
+
+    // Make RequestAsyncUnary public for grpc_call.h
+    using ::grpc::Service::RequestAsyncUnary;
+  };
+};
+
+}  // namespace detail
+}  // namespace operator
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto
index b0215d4a80c9440f09c35434903fd6166b03e8b0..598aaa4c51a6c5cd32eeffe08bbae849aee1a1df 100644
--- a/paddle/fluid/operators/detail/send_recv.proto
+++ b/paddle/fluid/operators/detail/send_recv.proto
@@ -32,6 +32,9 @@ enum VarType {
   SELECTED_ROWS = 1;
 }
 
+// NOTICE(gongwb):don't modify this proto if you are not
+//   not familar with how we serialize in sendrecvop_utils.h
+//   and deserilize it in  variable_response.h.
 message VariableMessage {
   enum Type {
     // Pod Types
@@ -45,7 +48,6 @@ message VariableMessage {
   }
 
   message LodData { repeated int64 lod_data = 1; }
-
   string varname = 1;
   // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
   VarType type = 2;
@@ -64,3 +66,5 @@ message VariableMessage {
 }
 
 message VoidMessage {}
+
+message TestMessage { int64 test_1 = 1; }
diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc
index 39117eeeb611b025c426938c60ddf82c6af232ca..d7bbf79c50651943d91c38bbaab775f5ee8dc395 100644
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
@@ -13,61 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include <sys/time.h>
+#include <thread>
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/operators/detail/bytebuffer_stream.h"
 #include "paddle/fluid/operators/detail/proto_encoder_helper.h"
+#include "paddle/fluid/operators/detail/variable_response.h"
 
 namespace paddle {
 namespace operators {
 namespace detail {
 
-void SerializeToMessage(const std::string& name, const framework::Variable* var,
-                        const platform::DeviceContext& ctx,
-                        sendrecv::VariableMessage* msg) {
-  msg->set_varname(name);
-  std::ostringstream oss;
-  switch (framework::ToVarType(var->Type())) {
-    case framework::proto::VarType_Type_LOD_TENSOR:
-      msg->set_type(sendrecv::VarType::LOD_TENSOR);
-      framework::SerializeToStream(oss, var->Get<framework::LoDTensor>(), ctx);
-      break;
-    case framework::proto::VarType_Type_SELECTED_ROWS:
-      msg->set_type(sendrecv::VarType::SELECTED_ROWS);
-      framework::SerializeToStream(oss, var->Get<framework::SelectedRows>(),
-                                   ctx);
-      break;
-    default: {
-      PADDLE_THROW("Serialize does not support type: %s",
-                   typeid(var->Type()).name());
-      break;
-    }
-  }
-  msg->set_serialized(oss.str());
-}
-
-void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
-                            const platform::DeviceContext& ctx,
-                            framework::Variable* var) {
-  std::istringstream iss(msg.serialized());
-  switch (msg.type()) {
-    case sendrecv::VarType::LOD_TENSOR:
-      DeserializeFromStream(iss, var->GetMutable<framework::LoDTensor>(), ctx);
-      break;
-    case sendrecv::VarType::SELECTED_ROWS: {
-      DeserializeFromStream(iss, var->GetMutable<framework::SelectedRows>(),
-                            ctx);
-      break;
-    }
-    default: {
-      PADDLE_THROW("Deserialize does not support type: %s",
-                   typeid(var->Type()).name());
-      break;
-    }
-  }
-}
-
 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            const platform::DeviceContext& ctx,
                            ::grpc::ByteBuffer* msg) {
@@ -123,6 +81,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
             static_cast<const platform::CUDADeviceContext&>(ctx);
         auto copy_size = tensor.memory_size();
         payload = memory::Alloc(cpu, copy_size);
+
         memory::Copy(cpu, payload,
                      boost::get<platform::CUDAPlace>(tensor.place()),
                      reinterpret_cast<const void*>(tensor.data<void>()),
@@ -132,6 +91,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
           platform::CPUPlace cpu;
           memory::Free(cpu, backing);
         };
+
 #endif
       } else {
         payload = tensor.data<void>();
@@ -219,80 +179,11 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
 
 void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                                const platform::DeviceContext& ctx,
-                               framework::Variable* var) {
-  sendrecv::VariableMessage meta;
-  GrpcByteBufferSource source;
-  source.Init(msg);
-  ::google::protobuf::io::CodedInputStream input(&source);
-  // do zerocopy parsing
-  PADDLE_ENFORCE(meta.ParseFromCodedStream(&input));
-  PADDLE_ENFORCE(input.ConsumedEntireMessage());
-  // dims is needed by both tensor and selectedrows
-  std::vector<int> vecdims;
-  for (auto& d : meta.dims()) {
-    vecdims.push_back(d);
-  }
-  framework::DDim dims = framework::make_ddim(vecdims);
-
-  if (meta.type() == sendrecv::LOD_TENSOR) {
-    auto* tensor = var->GetMutable<framework::LoDTensor>();
-    tensor->Resize(dims);
-    void* tensor_data = tensor->mutable_data(
-        ctx.GetPlace(),
-        paddle::operators::detail::ToTypeIndex(meta.data_type()));
-    framework::LoD lod;
-    for (int i = 0; i < meta.lod_level(); ++i) {
-      framework::Vector<size_t> v;
-      for (int j = 0; j < meta.lod(i).lod_data_size(); ++j) {
-        v.push_back(meta.lod(i).lod_data(j));
-      }
-      lod.push_back(v);
-    }
-    tensor->set_lod(lod);
-    // How to avoid copying and use the message buffer directly?
-    // Maybe need to find a way to release all memory except tensor content.
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-      platform::CPUPlace cpu;
-      auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
-      memory::Copy(boost::get<platform::CUDAPlace>(tensor->place()),
-                   tensor_data, cpu,
-                   reinterpret_cast<const void*>(meta.serialized().data()),
-                   meta.serialized().size(), gpu_dev_ctx.stream());
-      ctx.Wait();
-#endif
-    } else {
-      memcpy(tensor_data,
-             reinterpret_cast<const void*>(meta.serialized().data()),
-             meta.serialized().size());
-    }
-  } else if (meta.type() == sendrecv::SELECTED_ROWS) {
-    auto* slr = var->GetMutable<framework::SelectedRows>();
-    auto* tensor = slr->mutable_value();
-    int64_t* rows_data = slr->mutable_rows()->data();
-    tensor->Resize(dims);
-    void* tensor_data = tensor->mutable_data(
-        ctx.GetPlace(),
-        paddle::operators::detail::ToTypeIndex(meta.data_type()));
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-      platform::CPUPlace cpu;
-      auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
-      memory::Copy(boost::get<platform::CUDAPlace>(tensor->place()),
-                   tensor_data, cpu,
-                   reinterpret_cast<const void*>(meta.serialized().data()),
-                   meta.serialized().size(), gpu_dev_ctx.stream());
-      ctx.Wait();
-#endif
-    } else {
-      memcpy(tensor_data,
-             reinterpret_cast<const void*>(meta.serialized().data()),
-             meta.serialized().size());
-    }
-    // copy rows CPU data, GPU data will be copied lazly
-    memcpy(rows_data, reinterpret_cast<const void*>(meta.rows().data()),
-           meta.rows().size());
-  }
+                               const framework::Scope* scope,
+                               framework::Variable*& var) {
+  operators::detail::VariableResponse resp(scope, &ctx);
+  PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
+  var = resp.GetVar();
 }
 
 }  // namespace detail
diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h
index 4fa6aefd3e0b1bd45ac52b1eff3b29126d79f03a..3b875627032a6b08cc70280b3cc825c2a703923f 100644
--- a/paddle/fluid/operators/detail/sendrecvop_utils.h
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
 
 #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
@@ -36,21 +37,14 @@ namespace detail {
 
 typedef void (*DestroyCallback)(void*);
 
-void SerializeToMessage(const std::string& name, const framework::Variable* var,
-                        const platform::DeviceContext& ctx,
-                        sendrecv::VariableMessage* msg);
-
-void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
-                            const platform::DeviceContext& ctx,
-                            framework::Variable* var);
-
 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            const platform::DeviceContext& ctx,
                            ::grpc::ByteBuffer* msg);
 
 void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                                const platform::DeviceContext& ctx,
-                               framework::Variable* var);
+                               const framework::Scope* scope,
+                               framework::Variable*& var);
 
 inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
   switch (type) {
diff --git a/paddle/fluid/operators/detail/test_serde.cc b/paddle/fluid/operators/detail/test_serde.cc
index 2f06e5a686b996858d21930a1afa2861efca4a9b..e646c894d18d37f5343a10df2542a0e46ab13372 100644
--- a/paddle/fluid/operators/detail/test_serde.cc
+++ b/paddle/fluid/operators/detail/test_serde.cc
@@ -16,11 +16,13 @@ limitations under the License. */
 #include <string>
 #include <thread>
 
+#include <google/protobuf/text_format.h>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/operators/detail/variable_response.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
@@ -31,19 +33,21 @@ namespace operators = paddle::operators;
 namespace math = paddle::operators::math;
 namespace memory = paddle::memory;
 
-void RunSerdeTestTensor(platform::Place place) {
-  // serialize var to ByteBuffer
-  framework::Variable var;
-  auto* tensor = var.GetMutable<framework::LoDTensor>();
-  tensor->Resize(framework::make_ddim({4, 8, 4, 2}));
-  framework::LoD lod;
-  lod.push_back(framework::Vector<size_t>({1, 3, 8}));
-  tensor->set_lod(lod);
-  int tensor_numel = 4 * 8 * 4 * 2;
+void RunSerdeTestSelectedRows(platform::Place place) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto& ctx = *pool.Get(place);
+
+  // serialize var to ByteBuffer
+  framework::Variable var;
+  auto* slr = var.GetMutable<framework::SelectedRows>();
+  auto* tensor = slr->mutable_value();
+  auto* rows = slr->mutable_rows();
+  tensor->Resize(framework::make_ddim({2, 10}));
   tensor->mutable_data<float>(place);
-  math::set_constant(ctx, tensor, 31.9);
+  int tensor_numel = 2 * 10;
+  math::set_constant(ctx, tensor, 32.7);
+  rows->push_back(3);
+  rows->push_back(10);
 
   ::grpc::ByteBuffer msg;
   operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
@@ -56,62 +60,67 @@ void RunSerdeTestTensor(platform::Place place) {
   for (const auto& s : slices) {
     tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
   }
+
   sendrecv::VariableMessage varmsg;
   EXPECT_TRUE(varmsg.ParseFromString(tmp));
+
   EXPECT_EQ(varmsg.varname(), "myvar");
-  EXPECT_EQ(varmsg.type(), 0);
-  EXPECT_EQ(varmsg.dims()[0], 4);
-  EXPECT_EQ(varmsg.dims()[1], 8);
-  EXPECT_EQ(varmsg.dims()[2], 4);
-  EXPECT_EQ(varmsg.dims()[3], 2);
-  EXPECT_EQ(varmsg.lod_level(), 1);
-  EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
-  EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
-  EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
+  EXPECT_EQ(varmsg.type(), 1);
 
   const float* tensor_data =
       reinterpret_cast<const float*>(varmsg.serialized().data());
+  const int64_t* rows_data =
+      reinterpret_cast<const int64_t*>(varmsg.rows().data());
   for (int i = 0; i < tensor_numel; ++i) {
-    EXPECT_FLOAT_EQ(tensor_data[i], 31.9);
+    EXPECT_FLOAT_EQ(tensor_data[i], 32.7);
   }
-
+  EXPECT_EQ(rows_data[0], 3);
+  EXPECT_EQ(rows_data[1], 10);
   // deserialize zero-copy
-  framework::Variable var2;
-  operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
-  auto tensor2 = var2.Get<framework::LoDTensor>();
+  // framework::Variable var2;
+  // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
+  framework::Scope scope;
+  scope.Var("myvar");
+  operators::detail::VariableResponse resp(&scope, &ctx);
+  EXPECT_EQ(resp.Parse(msg), 0);
+
+  framework::Variable* var2 = resp.GetVar();
+
+  auto* slr2 = var2->GetMutable<framework::SelectedRows>();
+  auto* tensor2 = slr2->mutable_value();
+  auto* rows2 = slr2->mutable_rows();
   float* tensor_data2 = nullptr;
   framework::Tensor tmp_tensor;
 
   if (platform::is_gpu_place(ctx.GetPlace())) {
     platform::CPUPlace cpu;
-    framework::TensorCopy(tensor2, cpu, &tmp_tensor);
+    framework::TensorCopy(*tensor2, cpu, &tmp_tensor);
     tensor_data2 = tmp_tensor.data<float>();
   } else {
-    tensor_data2 = const_cast<float*>(tensor2.data<float>());
+    tensor_data2 = const_cast<float*>(tensor2->data<float>());
   }
+  const int64_t* rows_data2 = rows2->data();
 
-  EXPECT_EQ(varmsg.lod_level(), 1);
-  EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
-  EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
-  EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
-  for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
+  for (int i = 0; i < tensor_numel; ++i) {
+    EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
+  }
+  EXPECT_EQ(rows_data2[0], 3);
+  EXPECT_EQ(rows_data2[1], 10);
 }
 
-void RunSerdeTestSelectedRows(platform::Place place) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& ctx = *pool.Get(place);
-
+void RunTestLodTensor(platform::Place place, int from_type = 0) {
   // serialize var to ByteBuffer
   framework::Variable var;
-  auto* slr = var.GetMutable<framework::SelectedRows>();
-  auto* tensor = slr->mutable_value();
-  auto* rows = slr->mutable_rows();
-  tensor->Resize(framework::make_ddim({2, 10}));
+  auto* tensor = var.GetMutable<framework::LoDTensor>();
+  tensor->Resize(framework::make_ddim({4, 8, 4, 2}));
+  framework::LoD lod;
+  lod.push_back(framework::Vector<size_t>({1, 3, 8}));
+  tensor->set_lod(lod);
+  int tensor_numel = 4 * 8 * 4 * 2;
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& ctx = *pool.Get(place);
   tensor->mutable_data<float>(place);
-  int tensor_numel = 2 * 10;
-  math::set_constant(ctx, tensor, 32.7);
-  rows->push_back(3);
-  rows->push_back(10);
+  math::set_constant(ctx, tensor, 31.9);
 
   ::grpc::ByteBuffer msg;
   operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
@@ -126,61 +135,82 @@ void RunSerdeTestSelectedRows(platform::Place place) {
   }
   sendrecv::VariableMessage varmsg;
   EXPECT_TRUE(varmsg.ParseFromString(tmp));
-
   EXPECT_EQ(varmsg.varname(), "myvar");
-  EXPECT_EQ(varmsg.type(), 1);
+  EXPECT_EQ(varmsg.type(), 0);
+  EXPECT_EQ(varmsg.dims()[0], 4);
+  EXPECT_EQ(varmsg.dims()[1], 8);
+  EXPECT_EQ(varmsg.dims()[2], 4);
+  EXPECT_EQ(varmsg.dims()[3], 2);
+  EXPECT_EQ(varmsg.lod_level(), 1);
+  EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
+  EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
+  EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
 
   const float* tensor_data =
       reinterpret_cast<const float*>(varmsg.serialized().data());
-  const int64_t* rows_data =
-      reinterpret_cast<const int64_t*>(varmsg.rows().data());
   for (int i = 0; i < tensor_numel; ++i) {
-    EXPECT_FLOAT_EQ(tensor_data[i], 32.7);
+    EXPECT_FLOAT_EQ(tensor_data[i], 31.9);
   }
-  EXPECT_EQ(rows_data[0], 3);
-  EXPECT_EQ(rows_data[1], 10);
+
+  // message binary
+  std::string str;
+  varmsg.SerializeToString(&str);
+
+  // message bytebuffer
+  ::grpc::Slice slices_2[1];
+  int num_slices = 1;
+  slices_2[0] = ::grpc::Slice(str.length());
+  memcpy(const_cast<uint8_t*>(slices_2[0].begin()), str.c_str(), str.length());
+  ::grpc::ByteBuffer bytebuffer2(&slices_2[0], num_slices);
+
   // deserialize zero-copy
-  framework::Variable var2;
-  operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
+  framework::Scope scope;
+  scope.Var("myvar");
+  operators::detail::VariableResponse resp(&scope, &ctx);
+  if (from_type == 0) {
+    EXPECT_EQ(resp.Parse(msg), 0);
+  } else {
+    EXPECT_EQ(resp.Parse(bytebuffer2), 0);
+  }
 
-  auto* slr2 = var2.GetMutable<framework::SelectedRows>();
-  auto* tensor2 = slr2->mutable_value();
-  auto* rows2 = slr2->mutable_rows();
+  framework::Variable* var2 = resp.GetVar();
+
+  auto tensor2 = var2->Get<framework::LoDTensor>();
   float* tensor_data2 = nullptr;
   framework::Tensor tmp_tensor;
 
   if (platform::is_gpu_place(ctx.GetPlace())) {
     platform::CPUPlace cpu;
-    framework::TensorCopy(*tensor2, cpu, &tmp_tensor);
+    framework::TensorCopy(tensor2, cpu, &tmp_tensor);
     tensor_data2 = tmp_tensor.data<float>();
   } else {
-    tensor_data2 = const_cast<float*>(tensor2->data<float>());
+    tensor_data2 = const_cast<float*>(tensor2.data<float>());
   }
-  const int64_t* rows_data2 = rows2->data();
 
-  for (int i = 0; i < tensor_numel; ++i) {
-    EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
-  }
-  EXPECT_EQ(rows_data2[0], 3);
-  EXPECT_EQ(rows_data2[1], 10);
+  EXPECT_EQ(varmsg.lod_level(), 1);
+  EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
+  EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
+  EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
+  for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
 }
 
-TEST(SelectedRows, CPU) {
+TEST(LodTensor, Run) {
   platform::CPUPlace place;
-  RunSerdeTestSelectedRows(place);
+  RunTestLodTensor(place);
+  RunTestLodTensor(place, 1);
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDAPlace gpu(0);
+  RunTestLodTensor(gpu);
+  RunTestLodTensor(gpu, 1);
+#endif
 }
 
-TEST(SelectedRows, GPU) {
-  platform::CUDAPlace place;
+TEST(SelectedRows, Run) {
+  platform::CPUPlace place;
   RunSerdeTestSelectedRows(place);
-}
 
-TEST(Tensor, CPU) {
-  platform::CPUPlace place;
-  RunSerdeTestTensor(place);
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDAPlace gpu;
+  RunSerdeTestSelectedRows(gpu);
+#endif
 }
-
-TEST(Tensor, GPU) {
-  platform::CUDAPlace place;
-  RunSerdeTestTensor(place);
-}
\ No newline at end of file
diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc
new file mode 100644
index 0000000000000000000000000000000000000000..12e8eb0b4da2252b104415aef4156bf100c3e565
--- /dev/null
+++ b/paddle/fluid/operators/detail/variable_response.cc
@@ -0,0 +1,400 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/detail/variable_response.h"
+#include <string.h>
+#include "paddle/fluid/operators/detail/send_recv.pb.h"
+#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+enum WireType {
+  WIRETYPE_VARINT = 0,
+  WIRETYPE_LENGTH_DELIMITED = 2,
+};
+
+inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; }
+
+inline WireType GetTagWireType(uint32_t tag) {
+  return static_cast<WireType>(tag & 0x7);
+}
+
+bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input,
+                         int* result) {
+  uint64_t v;
+  if (input->ReadVarint64(&v) && v <= static_cast<uint64_t>(INT_MAX)) {
+    *result = static_cast<int>(v);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
+             const platform::DeviceContext& dev_ctx, platform::Place place,
+             void* dest, int size) {
+  const void* data = NULL;
+  int size_to_write = 0;
+
+  if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_WITH_CUDA
+    auto& gpu_dev_ctx =
+        static_cast<const platform::CUDADeviceContext&>(dev_ctx);
+    platform::CPUPlace cpu;
+
+    char* p = reinterpret_cast<char*>(dest);
+    while (size > 0) {
+      if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
+        return false;
+      }
+
+      memory::Copy(boost::get<platform::CUDAPlace>(place),
+                   reinterpret_cast<void*>(p), cpu, data, size_to_write,
+                   gpu_dev_ctx.stream());
+      p += size_to_write;
+      size -= size_to_write;
+
+      input->Skip(size_to_write);
+    }
+    gpu_dev_ctx.Wait();
+#else
+    PADDLE_THROW("Unexpected branch");
+#endif
+    return true;
+  }
+
+  char* p = reinterpret_cast<char*>(dest);
+  while (size > 0) {
+    if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
+      return false;
+    }
+    // TODO(gongwb): can we avoid copy?
+    platform::CPUPlace cpu;
+    memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);
+
+    p += size_to_write;
+    size -= size_to_write;
+
+    input->Skip(size_to_write);
+  }
+
+  return true;
+}
+
+bool VariableResponse::CopyLodTensorData(
+    ::google::protobuf::io::CodedInputStream* input,
+    const platform::DeviceContext& ctx, framework::DDim& dims, int length) {
+  auto var = scope_->FindVar(meta_.varname());
+  auto* tensor = var->GetMutable<framework::LoDTensor>();
+  tensor->Resize(dims);
+
+  framework::LoD lod;
+  for (int i = 0; i < meta_.lod_level(); ++i) {
+    framework::Vector<size_t> v;
+    for (int j = 0; j < meta_.lod(i).lod_data_size(); ++j) {
+      v.push_back(meta_.lod(i).lod_data(j));
+    }
+    lod.push_back(v);
+  }
+  tensor->set_lod(lod);
+
+  void* tensor_data =
+      tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type()));
+
+  if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
+    return false;
+  }
+
+  return true;
+}
+
+inline framework::DDim GetDims(
+    const ::google::protobuf::RepeatedField<::google::protobuf::int64>& dims) {
+  std::vector<int> vecdims;
+  for (auto& d : dims) {
+    vecdims.push_back(d);
+  }
+  return framework::make_ddim(vecdims);
+}
+
+bool VariableResponse::CopySelectRowsTensorData(
+    ::google::protobuf::io::CodedInputStream* input,
+    const platform::DeviceContext& ctx, framework::DDim& dims, int length) {
+  auto var = scope_->FindVar(meta_.varname());
+  auto* slr = var->GetMutable<framework::SelectedRows>();
+  auto* tensor = slr->mutable_value();
+  tensor->Resize(dims);
+  void* tensor_data = tensor->mutable_data(
+      ctx.GetPlace(),
+      paddle::operators::detail::ToTypeIndex(meta_.data_type()));
+
+  if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
+    return false;
+  }
+
+  return true;
+}
+
+bool VariableResponse::CopySelectRowsData(
+    ::google::protobuf::io::CodedInputStream* input,
+    const platform::DeviceContext& ctx, int length) {
+  auto var = scope_->FindVar(meta_.varname());
+  auto* slr = var->GetMutable<framework::SelectedRows>();
+  int64_t* rows_data = slr->mutable_rows()->data();
+
+  // copy rows CPU data, GPU data will be copied lazily.
+  platform::CPUPlace cpu;
+  if (!ReadRaw(input, ctx, cpu, rows_data, length)) {
+    return false;
+  }
+
+  return true;
+}
+
+bool ParseLodData(::google::protobuf::io::CodedInputStream* input,
+                  std::vector<int64_t>* lod) {
+  while (true) {
+    auto p = input->ReadTagWithCutoff(127);
+    int tag = GetTagFieldNumber(p.first);
+    WireType wt = GetTagWireType(p.first);
+
+    if (!p.second) {
+      return (tag == 0);
+    }
+
+    switch (tag) {
+      case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: {
+        uint64_t v;
+        if (wt == WIRETYPE_VARINT) {
+          if (!input->ReadVarint64(&v)) {
+            return false;
+          }
+          lod->push_back(v);
+          break;
+        }
+
+        if (wt == WIRETYPE_LENGTH_DELIMITED) {
+          int length = 0;
+          if (!input->ReadVarintSizeAsInt(&length)) {
+            return tag;
+          }
+
+          for (int i = 0; i < length; i++) {
+            uint64_t v;
+            if (!input->ReadVarint64(&v)) {
+              return false;
+            }
+            lod->push_back(v);
+          }
+          break;
+        }
+
+        return false;
+      }
+      default: { return false; }
+    }
+  }
+
+  return true;
+}
+
+int VariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) {
+  GrpcByteBufferSource source;
+  source.Init(byte_buffer);
+  GrpcByteBufferSourceWrapper r(&source);
+
+  return Parse(&r);
+}
+
+int VariableResponse::Parse(Source* source) {
+  ::google::protobuf::io::ZeroCopyInputStream* input_stream =
+      source->contents();
+  ::google::protobuf::io::CodedInputStream input(input_stream);
+  input.SetTotalBytesLimit(INT_MAX, INT_MAX);
+
+  while (true) {
+    auto p = input.ReadTagWithCutoff(127);
+    int tag = GetTagFieldNumber(p.first);
+    WireType wt = GetTagWireType(p.first);
+    if (!p.second) {
+      if (tag != 0) {
+        return -1;
+      }
+
+      return 0;
+    }
+
+    switch (tag) {
+      case sendrecv::VariableMessage::kVarnameFieldNumber: {
+        uint32_t length;
+        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
+          return tag;
+        }
+
+        std::string temp;
+        if (!input.ReadString(&temp, length)) {
+          return tag;
+        }
+
+        meta_.set_varname(temp);
+        break;
+      }
+      case sendrecv::VariableMessage::kTypeFieldNumber: {
+        uint64_t v;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+          return tag;
+        }
+
+        meta_.set_type(static_cast<::sendrecv::VarType>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kDataTypeFieldNumber: {
+        uint64_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+          return tag;
+        }
+
+        meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kDimsFieldNumber: {
+        // not packed
+        if (wt == WIRETYPE_VARINT) {
+          uint64_t v;
+          if (!input.ReadVarint64(&v)) {
+            return tag;
+          }
+          meta_.add_dims(v);
+          break;
+        }
+
+        // packed
+        if (wt == WIRETYPE_LENGTH_DELIMITED) {
+          int length = 0;
+          if (!input.ReadVarintSizeAsInt(&length)) {
+            return tag;
+          }
+          for (int i = 0; i < length; i++) {
+            uint64_t v;
+            if (!input.ReadVarint64(&v)) {
+              return tag;
+            }
+            meta_.add_dims(v);
+          }
+          break;
+        }
+
+        return tag;
+      }
+      case sendrecv::VariableMessage::kLodLevelFieldNumber: {
+        uint64_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+          return tag;
+        }
+        meta_.set_lod_level(static_cast<int64_t>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kLodFieldNumber: {
+        int length = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &length)) {
+          return tag;
+        }
+
+        std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p =
+            input.IncrementRecursionDepthAndPushLimit(length);
+
+        std::vector<int64_t> lod_data;
+        if (p.second < 0 || !ParseLodData(&input, &lod_data)) {
+          return tag;
+        }
+
+        if (!input.DecrementRecursionDepthAndPopLimit(p.first)) {
+          return false;
+        }
+
+        if (lod_data.size() == 0) {
+          break;
+        }
+
+        auto lod = meta_.add_lod();
+        for (uint32_t i = 0; i < lod_data.size(); i++) {
+          lod->add_lod_data(lod_data[i]);
+        }
+        break;
+      }
+      case sendrecv::VariableMessage::kSerializedFieldNumber: {
+        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
+                        meta_.type() == sendrecv::LOD_TENSOR) &&
+                           meta_.varname() != "",
+                       "meta info should be got first!");
+
+        int length = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &length)) {
+          return tag;
+        }
+
+        framework::DDim dims = GetDims(meta_.dims());
+        if (meta_.type() == sendrecv::LOD_TENSOR) {
+          PADDLE_ENFORCE(meta_.lod_size() >= 0,
+                         "lod info should be got first!");
+          if (!CopyLodTensorData(&input, *dev_ctx_, dims, length)) {
+            return tag;
+          }
+          break;
+        }
+
+        if (meta_.type() == sendrecv::SELECTED_ROWS) {
+          if (!CopySelectRowsTensorData(&input, *dev_ctx_, dims, length)) {
+            return tag;
+          }
+          break;
+        }
+
+        return tag;
+      }
+      case sendrecv::VariableMessage::kRowsFieldNumber: {
+        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
+                        meta_.type() == sendrecv::LOD_TENSOR) &&
+                           meta_.varname() != "",
+                       "meta info should be got first!");
+
+        int length = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &length)) {
+          return tag;
+        }
+
+        if (!CopySelectRowsData(&input, *dev_ctx_, length)) {
+          return tag;
+        }
+        break;
+      }
+
+      default: {
+        // Unknown tag, return unknown error.
+        return -1;
+      }
+    }
+  }
+
+  return 0;
+}
+
+};  // namespace detail
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/detail/variable_response.h
new file mode 100644
index 0000000000000000000000000000000000000000..e121ed7bce966d7dea94f71087f2187dcaa17cec
--- /dev/null
+++ b/paddle/fluid/operators/detail/variable_response.h
@@ -0,0 +1,81 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+
+#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/detail/send_recv.pb.h"
+
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+class VariableResponse {
+ public:
+  VariableResponse(const framework::Scope* scope,
+                   const platform::DeviceContext* dev_ctx)
+      : scope_(scope), dev_ctx_(dev_ctx) {}
+
+  virtual ~VariableResponse() {}
+
+  // return:
+  // 0:ok.
+  // -1: unkown error.
+  // other: number of error field.
+  int Parse(Source* source);
+
+  // return:
+  // 0:ok.
+  // -1: unkown error.
+  // other: number of error field.
+  int Parse(const ::grpc::ByteBuffer& byte_buffer);
+
+  inline std::string Varname() { return meta_.varname(); }
+
+  // should call parse first.
+  framework::Variable* GetVar() { return scope_->FindVar(meta_.varname()); }
+
+ private:
+  bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input,
+                                const platform::DeviceContext& ctx,
+                                framework::DDim& dims, int length);
+
+  bool CopySelectRowsData(::google::protobuf::io::CodedInputStream* input,
+                          const platform::DeviceContext& ctx, int length);
+
+  bool CopyLodTensorData(::google::protobuf::io::CodedInputStream* input,
+                         const platform::DeviceContext& ctx,
+                         framework::DDim& dims, int length);
+
+ private:
+  const framework::Scope* scope_;
+  const platform::DeviceContext* dev_ctx_;
+  // only Skeleton
+  sendrecv::VariableMessage meta_;
+};
+
+};  // namespace detail
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc
index 73c84c2fe0155d21d7059938330e44fa3668c6df..93ef15b9332168a9c62abfd4d0827207173ece45 100644
--- a/paddle/fluid/operators/detection_map_op.cc
+++ b/paddle/fluid/operators/detection_map_op.cc
@@ -188,8 +188,8 @@ The general steps are as follows. First, calculate the true positive and
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(detection_map, ops::DetectionMAPOp,
-                             ops::DetectionMAPOpMaker);
+REGISTER_OPERATOR(detection_map, ops::DetectionMAPOp, ops::DetectionMAPOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(
     detection_map, ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, float>,
     ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
index 1074ed6acc22a81f46c466d917ef973945a12898..e4436549f6185ba04a5f270893596a6dcb11e89b 100644
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -35,7 +35,6 @@ class DropoutOp : public framework::OperatorWithKernel {
   }
 };
 
-template <typename AttrType>
 class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
@@ -73,7 +72,6 @@ are set equal to their corresponding inputs.
   }
 };
 
-template <typename AttrType>
 class DropoutOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -103,11 +101,10 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker<float>, dropout_grad,
-            ops::DropoutOpGrad<float>);
+REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad,
+            ops::DropoutOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    dropout,
-    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float, float>);
+    dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     dropout_grad,
     ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
index d6f9c04359d733cb4f3f0586e9239ee67deb7078..94382739b5077b1449a8fd5be7952f35737ca340 100644
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -18,20 +18,22 @@ limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
 #include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
 
-template <typename T, typename AttrType>
+template <typename T>
 __global__ void RandomGenerator(const size_t n, const int seed,
-                                const AttrType dropout_prob, const T* src,
+                                const float dropout_prob, const T* src,
                                 T* mask_data, T* dst) {
   thrust::minstd_rand rng;
   rng.seed(seed);
-  thrust::uniform_real_distribution<AttrType> dist(0, 1);
+  thrust::uniform_real_distribution<float> dist(0, 1);
 
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
   for (; idx < n; idx += blockDim.x * gridDim.x) {
+    rng.discard(idx);
     if (dist(rng) < dropout_prob) {
       mask_data[idx] = static_cast<T>(0);
     } else {
@@ -44,14 +46,14 @@ __global__ void RandomGenerator(const size_t n, const int seed,
 // It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
-template <typename Place, typename T, typename AttrType>
+template <typename Place, typename T>
 class GPUDropoutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<Tensor>("X");
     auto* y = context.Output<Tensor>("Out");
     y->mutable_data<T>(context.GetPlace());
-    AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
+    float dropout_prob = context.Attr<float>("dropout_prob");
 
     auto X = EigenMatrix<T>::Reshape(*x, 1);
     auto Y = EigenMatrix<T>::Reshape(*y, 1);
@@ -70,11 +72,11 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
 
       int threads = 512;
       int grid = (x->numel() + threads - 1) / threads;
-      RandomGenerator<T, AttrType><<<grid, threads, 0,
-                                     context.cuda_device_context().stream()>>>(
+      RandomGenerator<
+          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
           size, seed, dropout_prob, x_data, mask_data, y_data);
     } else {
-      Y.device(place) = X * (1.0f - dropout_prob);
+      Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
     }
   }
 };
@@ -83,9 +85,9 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
-    dropout,
-    ops::GPUDropoutKernel<paddle::platform::CUDADeviceContext, float, float>);
-REGISTER_OP_CUDA_KERNEL(
-    dropout_grad,
-    ops::DropoutGradKernel<paddle::platform::CUDADeviceContext, float>);
+    dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
+    ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(dropout_grad,
+                        ops::DropoutGradKernel<plat::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h
index 209e4dec1756dc65fbf147c4dbbf0913d3c6ef7e..b5ee86ae2d11dfc835e1a3a6826ce016baf38a29 100644
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
@@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename DeviceContext, typename T, typename AttrType>
+template <typename DeviceContext, typename T>
 class CPUDropoutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
diff --git a/paddle/fluid/operators/elementwise_add_op.cu b/paddle/fluid/operators/elementwise_add_op.cu
index 19dc4a52152e2a7aa71476d4f0ef692d0af97b4a..dfff518f170b56d180b6883c363effb8dbd677b6 100644
--- a/paddle/fluid/operators/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise_add_op.cu
@@ -14,19 +14,20 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_add_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
-    elementwise_add,
-    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    elementwise_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int64_t>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_add_grad,
-    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext,
-                                  int64_t>);
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc
index 90c31877f6a87d1e237283d489353b4aba26c97b..debacf07c360b9aa69000a0d891f04239ed08807 100644
--- a/paddle/fluid/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -28,6 +29,10 @@ class FeedOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
+    // get device context from pool
+    auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+    platform::RecordEvent record_event(Type(), dev_ctx);
+
     auto feed_var_name = Input("X");
     auto *feed_var = scope.FindVar(feed_var_name);
 
@@ -50,14 +55,10 @@ class FeedOp : public framework::OperatorBase {
     auto &feed_item = feed_list.at(static_cast<size_t>(col));
     auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
 
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
     if (platform::is_same_place(feed_item.place(), place)) {
       out_item->ShareDataWith(feed_item);
     } else {
-      framework::TensorCopy(feed_item, place, dev_ctx, out_item);
+      framework::TensorCopy(feed_item, place, *dev_ctx, out_item);
     }
     out_item->set_lod(feed_item.lod());
   }
diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc
index d66f01d1b7ce8528a7c0177b2889aff7e0c5a12b..7c7f3e9059fbb1e3f2cca4f04edfff55c9452761 100644
--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -29,6 +30,9 @@ class FetchOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    platform::RecordEvent record_event(Type(), pool.Get(place));
+
     auto fetch_var_name = Input("X");
     auto *fetch_var = scope.FindVar(fetch_var_name);
     PADDLE_ENFORCE(fetch_var != nullptr,
@@ -53,7 +57,6 @@ class FetchOp : public framework::OperatorBase {
 
     // FIXME(yuyang18): Should we assume the fetch operator always generate
     // CPU outputs?
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(src_item.place());
 
     TensorCopy(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
diff --git a/paddle/fluid/operators/iou_similarity_op.cc b/paddle/fluid/operators/iou_similarity_op.cc
index ffbd7c7814c3fdec9fef0580ccd1ea3661ac0012..4b78ec510d1fb73592ee8af9a641622f4d713f8d 100755
--- a/paddle/fluid/operators/iou_similarity_op.cc
+++ b/paddle/fluid/operators/iou_similarity_op.cc
@@ -87,8 +87,9 @@ $$
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(iou_similarity, ops::IOUSimilarityOp,
-                             ops::IOUSimilarityOpMaker);
+REGISTER_OPERATOR(iou_similarity, ops::IOUSimilarityOp,
+                  ops::IOUSimilarityOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
 
 REGISTER_OP_CPU_KERNEL(
     iou_similarity,
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 4253300788462a3704076fc79241a864f2f130a0..08b83375dd5462e67c3da2c6c7401dd5e54793f0 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/proto_desc.h"
+#include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/detail/grpc_server.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
 #include "paddle/fluid/operators/detail/simple_block_queue.h"
@@ -68,9 +69,7 @@ class ListenAndServOp : public framework::OperatorBase {
   }
 
   void Stop() override {
-    detail::MessageWithName term_msg;
-    term_msg.first = LISTEN_TERMINATE_MESSAGE;
-    rpc_service_->Push(term_msg);
+    rpc_service_->Push(LISTEN_TERMINATE_MESSAGE);
     rpc_service_->ShutDown();
     server_thread_->join();
   }
@@ -89,6 +88,10 @@ class ListenAndServOp : public framework::OperatorBase {
 
     auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
     auto *program = block->Program();
+    int num_blocks = program->Size();
+    PADDLE_ENFORCE_GE(num_blocks, 2,
+                      "server program should have at least 2 blocks");
+
     framework::Executor executor(dev_place);
 
     // TODO(typhoonzero): change this to a while_op for every cluster-batch.
@@ -103,7 +106,7 @@ class ListenAndServOp : public framework::OperatorBase {
       size_t recv_var_cnt = 0;
       int batch_barrier = 0;
       while (batch_barrier != fan_in) {
-        const detail::MessageWithName &v = rpc_service_->Get();
+        const detail::ReceivedMessage v = rpc_service_->Get();
         auto recv_var_name = v.first;
         if (recv_var_name == LISTEN_TERMINATE_MESSAGE) {
           LOG(INFO) << "received terminate message and exit";
@@ -116,12 +119,11 @@ class ListenAndServOp : public framework::OperatorBase {
         } else {
           VLOG(3) << "received grad: " << recv_var_name;
           recv_var_cnt++;
-          auto *var = recv_scope.FindVar(recv_var_name);
+          auto var = v.second->GetVar();
           if (var == nullptr) {
             LOG(ERROR) << "Can not find server side var: " << recv_var_name;
             PADDLE_THROW("Can not find server side var");
           }
-          detail::DeserializeFromMessage(v.second, dev_ctx, var);
           if (var->IsType<framework::SelectedRows>()) {
             sparse_vars.push_back(var);
           }
@@ -132,12 +134,35 @@ class ListenAndServOp : public framework::OperatorBase {
         rpc_service_->ShutDown();
         break;
       }
-      try {
-        executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
-                     false /*create_local_scope*/, false /*create_vars*/);
-      } catch (std::exception &e) {
-        LOG(ERROR) << "run sub program error " << e.what();
+
+      // put optimize blocks in the thread pool to start run, the last block
+      // should be global ops.
+      // NOTE: if is_gpu_place, CUDA kernels are laugched by multiple threads
+      // and this will still work.
+
+      std::vector<std::future<void>> fs;
+      // block0 contains only listen_and_serv op, start run from block1.
+      for (int blkid = 1; blkid < num_blocks - 1; ++blkid) {
+        fs.push_back(
+            framework::Async([&executor, &program, &recv_scope, blkid]() {
+              int run_block = blkid;  // thread local
+              try {
+                executor.Run(*program, &recv_scope, run_block, false, false);
+              } catch (std::exception &e) {
+                LOG(ERROR) << "run sub program error " << e.what();
+              }
+            }));
+      }
+      for (int i = 0; i < num_blocks - 2; ++i) fs[i].wait();
+      // Run global block at final step, or block1 if there are only 2 blocks
+      if (num_blocks >= 2) {
+        try {
+          executor.Run(*program, &recv_scope, num_blocks - 1, false, false);
+        } catch (std::exception &e) {
+          LOG(ERROR) << "run sub program error " << e.what();
+        }
       }
+
       // Reset the received sparse variables, the sum operator would not
       // sum the input sparse variables which rows is empty at the next
       // mini-batch.
@@ -151,6 +176,10 @@ class ListenAndServOp : public framework::OperatorBase {
       rpc_service_->WaitClientGet(fan_in);
       sparse_vars.clear();
     }  // while(true)
+
+    // for (int i = 0; i < num_blocks; ++i) {
+    //   delete blk_ctx_list[i];
+    // }
   }
 
  protected:
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index 05f809ac5628420251957116bb2390b4502f11b8..6ffe0bec5e38432676ecadfa1abbbe70a1425bb1 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -29,6 +30,9 @@ class LoadOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
+    auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+    platform::RecordEvent record_event(Type(), dev_ctx);
+
     auto filename = Attr<std::string>("file_path");
     std::ifstream fin(filename);
     PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
@@ -41,9 +45,7 @@ class LoadOp : public framework::OperatorBase {
 
     auto *tensor = out_var->GetMutable<framework::LoDTensor>();
 
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    DeserializeFromStream(fin, tensor, dev_ctx);
+    DeserializeFromStream(fin, tensor, *dev_ctx);
 
     if (platform::is_gpu_place(place)) {
       // copy CPU to GPU
@@ -55,7 +57,7 @@ class LoadOp : public framework::OperatorBase {
       out_var->Clear();
       tensor = out_var->GetMutable<framework::LoDTensor>();
       tensor->set_lod(cpu_tensor.lod());
-      TensorCopy(cpu_tensor, place, dev_ctx, tensor);
+      TensorCopy(cpu_tensor, place, *dev_ctx, tensor);
     }
   }
 };
diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc
index 6a66297cb843ead1a507a6867c1c562224861cbf..7d5687f2d0666d393d7bb1c1a2fdde6c95e6d615 100644
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
@@ -22,17 +22,16 @@ class LoDResetOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    // input check
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of LoDResetOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of LoDResetOp should not be null.");
-    // If target LoD is not set form Input(), then it must be set from Attr().
-    if (!ctx->HasInput("TargetLoD")) {
+
+    if (!ctx->HasInput("Y")) {
       auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod");
-      PADDLE_ENFORCE(level0.size() > 1,
-                     "Target LoD is not found, should be set to be a valid one "
-                     "through Input() or Attr().");
+      PADDLE_ENFORCE_GT(level0.size(), 1,
+                        "If Input(Y) not provided, the target lod should be "
+                        "specified by attribute `target_lod`.");
     }
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
   }
@@ -50,36 +49,77 @@ class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   LoDResetOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(LoDTensor) The input tensor of lod_reset operator.");
-    AddInput("TargetLoD",
-             "(Tensor, optional) The target level 0 LoD from Input().")
+    AddInput("X",
+             "(Tensor, LoDTensor) Input variable of LoDResetOp which "
+             "could be a Tensor or LoDTensor, where the data of output "
+             "variable inherits from.");
+    AddInput("Y",
+             "(Tensor, LoDTensor, optional) If provided and Y is LoDTensor, "
+             "lod of Input(Y) would be considered as the target lod first, "
+             "otherwise data of Input(Y) would be considered as the "
+             "target lod.")
         .AsDispensable();
-    AddOutput("Out", "(LoDTensor) The output tensor of lod_reset operator.");
+    AddOutput("Out",
+              "(LoDTensor) Output variable of LoDResetOp which should be a "
+              "LoDTensor.");
     AddAttr<std::vector<int>>("target_lod",
                               "The target level 0 LoD from Attr().")
         .SetDefault(std::vector<int>{});
     AddComment(R"DOC(LoDReset operator
 
-Reset LoD of Input(X) into a new one specified by Input(TargetLoD) or
-Attr(target_lod), or set LoD for Input(X) if it doesn't have one.
-Currently the lod_reset operator only supports the reset of level 0 LoD.
-At least one of Input(TargetLoD) and Attr(target_lod) must be set,
-and if both of them are set, Input(TargetLoD) will be chosen as the
-target LoD.
+Set LoD of `X` to a new one specified by `Y` or attribute `target_lod`. When `Y`
+provided and `Y` is a LoDTensor, `Y.lod` would be considered as target LoD
+first, otherwise `Y.data` would be considered as target LoD. If `Y` is not
+provided, target LoD should be specified by attribute `target_lod`.
+If target LoD is specified by `Y.data` or `target_lod`, only one level LoD
+is supported.
+
+Example 1:
+
+Given a 1-level LoDTensor input(X):
+    X.lod =  [[ 0,     2,                   5      6 ]]
+    X.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    X.dims = [6, 1]
+
+attr(target_lod): [0, 4, 6]
+
+then we get a 1-level LoDTensor:
+    Out.lod =  [[ 0,                   4,            6 ]]
+    Out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    Out.dims = [6, 1]
+
+Example 2:
 
-An example:
-Given a float LoDTensor X with shape (6, 1), its transpose form represents
+Given a 1-level LoDTensor input(X):
+    X.lod =  [[ 0,     2,                   5      6 ]]
+    X.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    X.dims = [6, 1]
 
-    [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+input(Y) is a Tensor:
+    Y.data = [[0, 2, 6]]
+    Y.dims = [1, 3]
 
-with LoD = [[0, 2, 5, 6]] and the three (transposed) sequences look like
+then we get a 1-level LoDTensor:
+    Out.lod =  [[ 0,     2,                          6 ]]
+    Out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    Out.dims = [6, 1]
 
-    [1.0, 2.0], [3.0, 4.0, 5.0], [6.0].
+Example 3:
 
-If target LoD = [0, 4, 6], the lod_reset operator will reset the LoD and
-the sequences that the LoDTensor Output(Out) contains becomes:
+Given a 1-level LoDTensor input(X):
+    X.lod =  [[ 0,      2,                   5     6 ]]
+    X.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    X.dims = [6, 1]
 
-    [1.0, 2.0, 3.0, 4.0], [5.0, 6.0].
+input(Y) is a 2-level LoDTensor:
+    Y.lod =  [[0, 2, 4], [0, 2, 5, 6]]
+    Y.data = [[1.1], [2.1], [3.1], [4.1], [5.1], [6.1]]
+    Y.dims = [6, 1]
+
+then we get a 2-level LoDTensor:
+    Out.lod =  [[0, 2, 4], [0, 2, 5, 6]]
+    Out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    Out.dims = [6, 1]
 
 )DOC");
   }
@@ -90,10 +130,16 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LoDResetGradOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) shouldn't be null.");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+                   "Input(Out@Grad) of LoDResetGradOp should not be null.");
+
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
+      ctx->ShareLoD("X", /*->*/ x_grad_name);
+    }
   }
 
  protected:
@@ -111,9 +157,13 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad,
             ops::LoDResetGradOp);
-REGISTER_OP_CPU_KERNEL(lod_reset,
-                       ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
-                       ops::LoDResetKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
+    ops::LoDResetKernel<paddle::platform::CPUPlace, double>,
+    ops::LoDResetKernel<paddle::platform::CPUPlace, int>,
+    ops::LoDResetKernel<paddle::platform::CPUPlace, int64_t>);
 REGISTER_OP_CPU_KERNEL(
     lod_reset_grad, ops::LoDResetGradKernel<paddle::platform::CPUPlace, float>,
-    ops::LoDResetGradKernel<paddle::platform::CPUPlace, double>);
+    ops::LoDResetGradKernel<paddle::platform::CPUPlace, double>,
+    ops::LoDResetGradKernel<paddle::platform::CPUPlace, int>,
+    ops::LoDResetGradKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/fluid/operators/lod_reset_op.cu b/paddle/fluid/operators/lod_reset_op.cu
index b0e87a851a77a1cc98d419a63d4d9e5e1b9dd163..888d4c12eb4e3f4fd94d8dd4178c59acd0abb23b 100644
--- a/paddle/fluid/operators/lod_reset_op.cu
+++ b/paddle/fluid/operators/lod_reset_op.cu
@@ -18,8 +18,12 @@ namespace ops = paddle::operators;
 
 REGISTER_OP_CUDA_KERNEL(
     lod_reset, ops::LoDResetKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     lod_reset_grad,
     ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h
index 8186d4f8262101edc723af390eee1aec4fa6f3a5..99f01c2a255ade81421c2bba95ff3d38ced6f87c 100644
--- a/paddle/fluid/operators/lod_reset_op.h
+++ b/paddle/fluid/operators/lod_reset_op.h
@@ -26,35 +26,46 @@ class LoDResetKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* out = ctx.Output<framework::LoDTensor>("Out");
     auto* in = ctx.Input<framework::LoDTensor>("X");
-    auto* lod_t = ctx.Input<framework::Tensor>("TargetLoD");
+    auto* lod_t = ctx.Input<framework::LoDTensor>("Y");
+
+    out->ShareDataWith(*in);
 
     std::vector<int> level0;
     if (lod_t) {
-      auto* lod = lod_t->data<int>();
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-        framework::Tensor lod_cpu;
-        framework::TensorCopy(*lod_t, platform::CPUPlace(),
-                              ctx.device_context(), &lod_cpu);
-        lod = lod_cpu.data<int>();
+      if (lod_t->lod().size() > 0) {
+        auto y_lod = lod_t->lod();
+        auto last_level = y_lod[y_lod.size() - 1];
+        PADDLE_ENFORCE_EQ(last_level.back(), in->dims()[0],
+                          "Last value of `Y`'s last level LoD should be equal "
+                          "to the first dimension of `X`");
+        out->set_lod(y_lod);
+        return;  // early return, since lod already set
+      } else {
+        auto* lod = lod_t->data<int>();
+        if (platform::is_gpu_place(ctx.GetPlace())) {
+          framework::Tensor lod_cpu;
+          framework::TensorCopy(*lod_t, platform::CPUPlace(),
+                                ctx.device_context(), &lod_cpu);
+          lod = lod_cpu.data<int>();
+        }
+        level0 = std::vector<int>(lod, lod + lod_t->numel());
       }
-      level0 = std::vector<int>(lod, lod + lod_t->numel());
     } else {
       level0 = ctx.Attr<std::vector<int>>("target_lod");
     }
 
-    PADDLE_ENFORCE(level0.size() > 1UL,
-                   "The size of target LoD should be greater than 1.");
-    PADDLE_ENFORCE(level0[0] == 0,
-                   "Target LoD should be a vector starting from 0.");
-    PADDLE_ENFORCE(level0.back() == in->dims()[0],
-                   "Target LoD should be a vector end with the "
-                   "first dimension of Input(X).");
+    PADDLE_ENFORCE_GT(level0.size(), 1UL,
+                      "Size of target LoD should be greater than 1.");
+    PADDLE_ENFORCE_EQ(level0[0], 0,
+                      "Target LoD should be a vector starting from 0.");
+    PADDLE_ENFORCE_EQ(level0.back(), in->dims()[0],
+                      "Target LoD should be a vector end with the "
+                      "first dimension of Input(X).");
     for (size_t i = 0; i < level0.size() - 1; ++i) {
       PADDLE_ENFORCE(level0[i + 1] > level0[i],
                      "Target LoD should be an ascending vector.");
     }
 
-    out->ShareDataWith(*in);
     // cast level0 to size_t
     std::vector<size_t> ulevel0(level0.size(), 0);
     std::transform(level0.begin(), level0.end(), ulevel0.begin(),
diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3bead16ce44c26b9d7a6f2a5c6b471612494d595
--- /dev/null
+++ b/paddle/fluid/operators/lrn_mkldnn_op.cc
@@ -0,0 +1,209 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/lrn_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::Tensor;
+using paddle::platform::MKLDNNDeviceContext;
+
+namespace {
+template <typename T, typename... Args>
+std::shared_ptr<T> insert_to_context(const std::string& key,
+                                     const MKLDNNDeviceContext& dev_ctx,
+                                     Args&&... args) {
+  auto p = std::static_pointer_cast<T, void>(dev_ctx.GetBlob(key));
+
+  if (!p) {
+    p = std::make_shared<T>(args...);
+    dev_ctx.SetBlob(key, std::static_pointer_cast<void, T>(p));
+  }
+
+  return p;
+}
+}  // namespace
+
+template <typename T>
+class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(std::is_same<T, float>::value,
+                   "MKLDNN LRN must use float data.");
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "MKLDNN LRN must use CPUPlace.");
+
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    auto x = ctx.Input<Tensor>("X");
+    auto out = ctx.Output<Tensor>("Out");
+    auto mid = ctx.Output<Tensor>("MidOut");
+
+    auto input_data = x->data<T>();
+    auto output_data = out->mutable_data<T>(ctx.GetPlace());
+    mid->mutable_data<T>(ctx.GetPlace());
+
+    const int n = ctx.Attr<int>("n");
+    const float alpha = ctx.Attr<float>("alpha");
+    const float beta = ctx.Attr<float>("beta");
+    const float k = ctx.Attr<float>("k");
+    const bool is_test = ctx.Attr<bool>("is_test");
+
+    auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
+    e_mid = e_mid.constant(k);
+
+    auto dims = paddle::framework::vectorize2int(x->dims());
+
+    auto src_md = paddle::platform::MKLDNNMemDesc(
+        dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
+
+    auto dst_md = paddle::platform::MKLDNNMemDesc(
+        dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
+
+    auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward,
+                                                  mkldnn::lrn_across_channels,
+                                                  src_md,
+                                                  n,
+                                                  alpha,
+                                                  beta,
+                                                  k};
+
+    auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
+    auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine},
+                                     static_cast<void*>(output_data)};
+
+    std::unique_ptr<mkldnn::lrn_forward> forward_op = nullptr;
+
+    if (!is_test) {
+      const std::string key = ctx.op().Output("Out");
+      const std::string key_src_memory = key + "@lrn_src_memory";
+      const std::string key_pd = key + "@lrn_pd";
+      const std::string key_workspace_memory = key + "@lrn_workspace_memory";
+
+      auto forward_pd = insert_to_context<mkldnn::lrn_forward::primitive_desc>(
+          key_pd, dev_ctx, forward_desc, mkldnn_engine);
+
+      auto src_memory = insert_to_context<mkldnn::memory>(
+          key_src_memory, dev_ctx, src_memory_pd);
+
+      src_memory->set_data_handle(
+          static_cast<void*>(const_cast<T*>(input_data)));
+
+      auto workspace_memory = insert_to_context<mkldnn::memory>(
+          key_workspace_memory, dev_ctx,
+          forward_pd->workspace_primitive_desc());
+
+      forward_op.reset(new mkldnn::lrn_forward{*forward_pd, *src_memory,
+                                               *workspace_memory, dst_memory});
+
+    } else {
+      auto forward_pd =
+          mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine};
+      auto src_memory = mkldnn::memory{
+          src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))};
+      auto workspace_memory =
+          mkldnn::memory{forward_pd.workspace_primitive_desc()};
+
+      forward_op.reset(new mkldnn::lrn_forward{forward_pd, src_memory,
+                                               workspace_memory, dst_memory});
+    }
+
+    std::vector<mkldnn::primitive> pipeline = {*forward_op};
+    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+  }
+};
+
+template <typename T>
+class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(std::is_same<T, float>::value,
+                   "MKLDNN LRN must use float data.");
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "MKLDNN LRN must use CPUPlace.");
+
+    auto x = ctx.Input<Tensor>("X");
+
+    auto out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    const std::string key = ctx.op().Input("Out");
+    const std::string key_src_memory = key + "@lrn_src_memory";
+    const std::string key_pd = key + "@lrn_pd";
+    const std::string key_workspace_memory = key + "@lrn_workspace_memory";
+
+    const int n = ctx.Attr<int>("n");
+    const float alpha = ctx.Attr<float>("alpha");
+    const float beta = ctx.Attr<float>("beta");
+    const float k = ctx.Attr<float>("k");
+
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    auto x_grad_data = x_grad->mutable_data<T>(ctx.GetPlace());
+    auto out_grad_data = out_grad->data<T>();
+
+    auto dims = paddle::framework::vectorize2int(x->dims());
+
+    auto src_md = paddle::platform::MKLDNNMemDesc(
+        dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
+
+    auto diff_src_md = paddle::platform::MKLDNNMemDesc(
+        dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
+
+    auto diff_dst_md = paddle::platform::MKLDNNMemDesc(
+        dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
+
+    auto diff_dst_memory =
+        mkldnn::memory{{diff_dst_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<float*>(out_grad_data))};
+
+    auto diff_src_memory = mkldnn::memory{{diff_src_md, mkldnn_engine},
+                                          static_cast<void*>(x_grad_data)};
+
+    auto backward_desc = mkldnn::lrn_backward::desc{
+        mkldnn::lrn_across_channels, src_md, diff_src_md, n, alpha, beta, k};
+
+    auto forward_pd = dev_ctx.GetBlob(key_pd);
+
+    auto backward_pd = mkldnn::lrn_backward::primitive_desc{
+        backward_desc, mkldnn_engine,
+        *static_cast<mkldnn::lrn_forward::primitive_desc*>(forward_pd.get())};
+
+    std::shared_ptr<void> workspace_memory =
+        dev_ctx.GetBlob(key_workspace_memory);
+
+    auto src_memory = dev_ctx.GetBlob(key_src_memory);
+    auto backward_op = mkldnn::lrn_backward{
+        backward_pd, *static_cast<mkldnn::memory*>(src_memory.get()),
+        diff_dst_memory, *static_cast<mkldnn::memory*>(workspace_memory.get()),
+        diff_src_memory};
+
+    std::vector<mkldnn::primitive> pipeline = {backward_op};
+    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(lrn, MKLDNN, paddle::platform::CPUPlace,
+                   ops::LRNMKLDNNOpKernel<float>);
+REGISTER_OP_KERNEL(lrn_grad, MKLDNN, paddle::platform::CPUPlace,
+                   ops::LRNMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
index 692e85dcffa583abcb22a1629953badc67489efa..2b1947a187bbd17871107553127647032ac7d7f9 100644
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lrn_op.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 
 namespace paddle {
 namespace operators {
@@ -116,6 +119,26 @@ struct LRNGradFunctor<platform::CPUDeviceContext, T> {
 template struct LRNGradFunctor<platform::CPUDeviceContext, float>;
 template struct LRNGradFunctor<platform::CPUDeviceContext, double>;
 
+namespace {
+framework::OpKernelType GetExpectedLRNKernel(
+    const framework::ExecutionContext& ctx) {
+  framework::LibraryType library_{framework::LibraryType::kPlain};
+#ifdef PADDLE_WITH_MKLDNN
+  if (library_ == framework::LibraryType::kPlain &&
+      platform::CanMKLDNNBeUsed(ctx)) {
+    library_ = framework::LibraryType::kMKLDNN;
+  }
+#endif
+
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
+      layout_, library_);
+}
+}  // namespace
+
 class LRNOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -135,6 +158,11 @@ class LRNOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("MidOut", x_dim);
     ctx->ShareLoD("X", /*->*/ "Out");
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return GetExpectedLRNKernel(ctx);
+  }
 };
 
 template <typename T>
@@ -176,6 +204,17 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
                "beta is the power number.")
         .SetDefault(0.75)
         .GreaterThan(0.0);
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "data_format",
+        "(string, default NCHW) Only used in "
+        "An optional string from: \"NHWC\", \"NCHW\". "
+        "Defaults to \"NHWC\". Specify the data format of the output data, "
+        "the input will be transformed automatically. ")
+        .SetDefault("AnyLayout");
+    AddAttr<bool>("is_test", "").SetDefault(false);
 
     AddComment(R"DOC(
 Local Response Normalization Operator.
@@ -223,8 +262,12 @@ class LRNOpGrad : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
   }
-};
 
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return GetExpectedLRNKernel(ctx);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index fba1612d10f0494f4ab06fabdd0e799a74dafd53..ee0e91132bce52998e9c45b37335618e4354e1cd 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -6,6 +6,7 @@ function(math_library TARGET)
     # But it handle split GPU/CPU code and link some common library.
     set(cc_srcs)
     set(cu_srcs)
+    set(hip_srcs)
     set(math_common_deps device_context framework_proto)
     set(multiValueArgs DEPS)
     cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
@@ -17,10 +18,15 @@ function(math_library TARGET)
     if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
         list(APPEND cu_srcs ${TARGET}.cu)
     endif()
+    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
+        list(APPEND hip_srcs ${TARGET}.hip.cu)
+    endif()
 
     list(LENGTH cc_srcs cc_srcs_len)
     if (WITH_GPU)
         nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
+    elseif (WITH_AMD_GPU)
+        hip_library(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
     elseif(${cc_srcs_len} GREATER 0)
         cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
     endif()
@@ -43,7 +49,7 @@ math_library(sequence2batch)
 math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
-math_library(softmax)
+math_library(softmax DEPS math_function)
 math_library(unpooling)
 math_library(vol2col)
 
diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc
index b542143419e05e9baf29e9a2322447f32ddd9829..bfce56f9fdcafa0800c9742b9fae41fd6a572b40 100644
--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat.cc
@@ -20,7 +20,7 @@ namespace math {
 
 /*
  * All tensors' dimension should be the same and the values of
- * each dimension are the same, except the axis dimension.
+ * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
 class ConcatFunctor<platform::CPUDeviceContext, T> {
@@ -44,7 +44,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
       out_cols += t_cols;
       input_cols[i] = t_cols;
     }
-    auto& cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
+    auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
 
     // computation
     for (int k = 0; k < out_rows; ++k) {
@@ -63,7 +63,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
 
 /*
  * All tensors' dimension should be the same and the values of
- * each dimension are the same, except the axis dimension.
+ * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
 class ConcatGradFunctor<platform::CPUDeviceContext, T> {
@@ -87,7 +87,7 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
       input_cols += t_cols;
       output_cols[i] = t_cols;
     }
-    auto& cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
+    auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
 
     // computation
     for (int k = 0; k < input_rows; ++k) {
diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu
index 60b266f08fb2d4217c5933902d69de96fc2abe22..c0786757b34195d47c3b1cadc938f0e9fcfd6038 100644
--- a/paddle/fluid/operators/math/concat.cu
+++ b/paddle/fluid/operators/math/concat.cu
@@ -66,68 +66,66 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
 }
 
 template <typename T>
-__global__ void KernelConcat(T** inputs, const int input_col,
-                             const int output_rows, const int output_cols,
-                             T* output) {
+__global__ void KernelConcat(T** inputs_data, const int fixed_in_col,
+                             const int out_rows, const int out_cols,
+                             T* output_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  double inv_input_col = 1.0 / input_col;
-  for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
-    int split = tid_x * inv_input_col;
-    int in_offset = tid_x - split * input_col;
-    T* input_ptr = inputs[split];
+  for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) {
+    int split = tid_x * 1.0 / fixed_in_col;
+    int in_offset = tid_x - split * fixed_in_col;
+    T* input_ptr = inputs_data[split];
     int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) {
-      output[tid_y * output_cols + tid_x] =
-          input_ptr[tid_y * input_col + in_offset];
+    for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) {
+      output_data[tid_y * out_cols + tid_x] =
+          input_ptr[tid_y * fixed_in_col + in_offset];
     }
   }
 }
 
 template <typename T>
-__global__ void KernelConcatGrad(const T* input, const int input_row,
-                                 const int input_col, const int* output_cols,
-                                 int col_size, T** outputs) {
+__global__ void KernelConcatGrad(const T* input_data, const int in_row,
+                                 const int in_col, const int* out_cols,
+                                 int out_cols_size, T** outputs_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int segment = upper_bound<int>(output_cols, col_size, tid_x) - 1;
-  int curr_offset = output_cols[segment];
+  int segment = upper_bound<int>(out_cols, out_cols_size, tid_x) - 1;
+  int curr_offset = out_cols[segment];
   int curr_segment = segment;
-  for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
+  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
     T curr_col_offset;
-    while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) {
+    while ((curr_col_offset = out_cols[curr_segment + 1]) <= tid_x) {
       curr_offset = curr_col_offset;
       ++curr_segment;
     }
 
     int local_col = tid_x - curr_offset;
     int segment_width = curr_col_offset - curr_offset;
-    T* output_ptr = outputs[curr_segment];
+    T* output_ptr = outputs_data[curr_segment];
     int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
+    for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
       output_ptr[tid_y * segment_width + local_col] =
-          input[tid_y * input_col + tid_x];
+          input_data[tid_y * in_col + tid_x];
   }
 }
 
 template <typename T>
-__global__ void KernelConcatGrad(const T* input, const int input_row,
-                                 const int input_col, const int output_cols,
-                                 T** outputs) {
+__global__ void KernelConcatGrad(const T* input_data, const int in_row,
+                                 const int in_col, const int fixed_out_col,
+                                 T** outputs_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  double inv_input_col = 1.0 / input_col;
-  for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
-    int split = tid_x * inv_input_col;
-    int in_offset = tid_x - split * input_col;
-    T* output_ptr = outputs[split];
+  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
+    int split = tid_x / fixed_out_col;
+    int in_offset = tid_x - split * fixed_out_col;
+    T* output_ptr = outputs_data[split];
     int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
-      output_ptr[tid_y * output_cols + in_offset] =
-          input[tid_y * input_col + tid_x];
+    for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+      output_ptr[tid_y * fixed_out_col + in_offset] =
+          input_data[tid_y * in_col + tid_x];
   }
 }
 
 /*
  * All tensors' dimension should be the same and the values of
- * each dimension are the same, except the axis dimension.
+ * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
 class ConcatFunctor<platform::CUDADeviceContext, T> {
@@ -136,41 +134,40 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
                   const std::vector<framework::Tensor>& input, const int axis,
                   framework::Tensor* output) {
     // TODO(zcd): Add input data validity checking
-    int num = input.size();
-    int rows = 1;
+    int in_num = input.size();
+    int in_row = 1;
     auto dim_0 = input[0].dims();
     for (int i = 0; i < axis; ++i) {
-      rows *= dim_0[i];
+      in_row *= dim_0[i];
     }
-    int cols = input[0].numel() / rows;
-    int out_rows = rows, out_cols = 0;
+    int in_col = input[0].numel() / in_row;
+    int out_row = in_row, out_col = 0;
 
-    framework::Vector<int16_t> inputs_data(num * sizeof(T*) / 2);
-    framework::Vector<int> inputs_cols(num + 1);
-    inputs_cols[0] = 0;
+    framework::Vector<int16_t> inputs_data(in_num * sizeof(T*) / 2);
+    framework::Vector<int> inputs_col(in_num + 1);
     T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
 
+    inputs_col[0] = 0;
     bool sameShape = true;
-    for (int i = 0; i < num; ++i) {
-      int t_cols = input[i].numel() / rows;
+    for (int i = 0; i < in_num; ++i) {
+      int t_cols = input[i].numel() / in_row;
       if (sameShape) {
-        if (t_cols != cols) sameShape = false;
+        if (t_cols != in_col) sameShape = false;
       }
-      out_cols += t_cols;
-      inputs_cols[i + 1] = out_cols;
+      out_col += t_cols;
+      inputs_col[i + 1] = out_col;
       inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
     }
 
-    T** ins_gpu =
+    T** dev_ins_data =
         reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
-    const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());
 
     // computation
     // set the thread block and grid according to CurrentDeviceId
     const int kThreadsPerBlock = 1024;
     int block_cols = kThreadsPerBlock;
-    if (out_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
-      block_cols = ((out_cols + 31) >> 5) << 5;
+    if (out_col < kThreadsPerBlock) {  // block_cols is aligned by 32.
+      block_cols = ((out_col + 31) >> 5) << 5;
     }
     int block_rows = kThreadsPerBlock / block_cols;
     dim3 block_size = dim3(block_cols, block_rows, 1);
@@ -179,25 +176,26 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
     int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
 
     int grid_cols =
-        std::min((out_cols + block_cols - 1) / block_cols, max_blocks);
+        std::min((out_col + block_cols - 1) / block_cols, max_blocks);
     int grid_rows =
-        std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1));
+        std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
     dim3 grid_size = dim3(grid_cols, grid_rows, 1);
 
     if (sameShape) {
       KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
-          ins_gpu, cols, out_rows, out_cols, output->data<T>());
+          dev_ins_data, in_col, out_row, out_col, output->data<T>());
     } else {
+      const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace());
       KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
-          ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows,
-          out_cols, output->data<T>());
+          dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
+          out_row, out_col, output->data<T>());
     }
   }
 };
 
 /*
  * All tensors' dimension should be the same and the values of
- * each dimension are the same, except the axis dimension.
+ * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
 class ConcatGradFunctor<platform::CUDADeviceContext, T> {
@@ -206,41 +204,40 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
                   const framework::Tensor& input, const int axis,
                   std::vector<framework::Tensor>& outputs) {
     // TODO(zcd): Add input data validity checking
-    int num = outputs.size();
-    int input_row = 1;
+    int o_num = outputs.size();
+    int out_row = 1;
     auto dim_0 = outputs[0].dims();
     for (int i = 0; i < axis; ++i) {
-      input_row *= dim_0[i];
+      out_row *= dim_0[i];
     }
 
-    int output_col_0 = outputs[0].numel() / input_row;
-    int input_col = 0;
+    int out_col = outputs[0].numel() / out_row;
+    int in_col = 0, in_row = out_row;
     bool sameShape = true;
 
-    framework::Vector<int16_t> outputs_data(num * sizeof(T*) / 2);
-    framework::Vector<int> outputs_cols(num + 1);
-    outputs_cols[0] = 0;
+    framework::Vector<int16_t> outputs_data(o_num * sizeof(T*) / 2);
+    framework::Vector<int> outputs_cols(o_num + 1);
     T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());
 
-    for (int i = 0; i < num; ++i) {
-      int t_col = outputs[i].numel() / input_row;
+    outputs_cols[0] = 0;
+    for (int i = 0; i < o_num; ++i) {
+      int t_col = outputs[i].numel() / out_row;
       if (sameShape) {
-        if (t_col != output_col_0) sameShape = false;
+        if (t_col != out_col) sameShape = false;
       }
-      input_col += t_col;
-      outputs_cols[i + 1] = input_col;
+      in_col += t_col;
+      outputs_cols[i + 1] = in_col;
       outputs_ptr[i] = outputs[i].data<T>();
     }
 
-    T** outs_gpu =
+    T** dev_out_gpu_data =
         reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
-    const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());
 
     // computation
     const int kThreadsPerBlock = 1024;
     int block_cols = kThreadsPerBlock;
-    if (input_col < kThreadsPerBlock) {  // block_cols is aligned by 32.
-      block_cols = ((input_col + 31) >> 5) << 5;
+    if (in_col < kThreadsPerBlock) {  // block_cols is aligned by 32.
+      block_cols = ((in_col + 31) >> 5) << 5;
     }
     int block_rows = kThreadsPerBlock / block_cols;
     dim3 block_size = dim3(block_cols, block_rows, 1);
@@ -249,18 +246,19 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
     int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
 
     int grid_cols =
-        std::min((input_col + block_cols - 1) / block_cols, max_blocks);
+        std::min((in_col + block_cols - 1) / block_cols, max_blocks);
     int grid_rows =
-        std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1));
+        std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
     dim3 grid_size = dim3(grid_cols, grid_rows, 1);
 
     if (sameShape) {
       KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
-          input.data<T>(), input_row, input_col, output_col_0, outs_gpu);
+          input.data<T>(), in_row, in_col, out_col, dev_out_gpu_data);
     } else {
+      const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
       KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
-          input.data<T>(), input_row, input_col, outs_col_gpu,
-          static_cast<int>(outputs_cols.size()), outs_gpu);
+          input.data<T>(), in_row, in_col, dev_outs_col_data,
+          static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
     }
   }
 };
diff --git a/paddle/fluid/operators/math/concat.hip.cu b/paddle/fluid/operators/math/concat.hip.cu
new file mode 100644
index 0000000000000000000000000000000000000000..eacef0438883891671fec6e4001f862f619723cb
--- /dev/null
+++ b/paddle/fluid/operators/math/concat.hip.cu
@@ -0,0 +1,15 @@
+/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <hip/hip_runtime.h>
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index 35d251f71a0cb631d5900498ea3188b5ddeae334..299a0aed01dfe0448d896738d9fd33319b1b2887 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -278,6 +278,7 @@ void axpy<platform::CPUDeviceContext, double>(
   cblas_daxpy(n, alpha, x, 1, y, 1);
 }
 
+template struct SetConstant<platform::CPUDeviceContext, platform::float16>;
 template struct SetConstant<platform::CPUDeviceContext, float>;
 template struct SetConstant<platform::CPUDeviceContext, double>;
 template struct SetConstant<platform::CPUDeviceContext, int>;
@@ -371,6 +372,8 @@ template struct RowwiseAdd<platform::CPUDeviceContext, double>;
 
 template struct ColwiseSum<platform::CPUDeviceContext, float>;
 template struct ColwiseSum<platform::CPUDeviceContext, double>;
+template struct ColwiseSum<platform::CPUDeviceContext, int>;
+template struct ColwiseSum<platform::CPUDeviceContext, int64_t>;
 
 template struct RowwiseSum<platform::CPUDeviceContext, float>;
 template struct RowwiseSum<platform::CPUDeviceContext, double>;
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index 3abbcdb71d03eaf6f8eba3d97150d27ac5a5405e..1e909db5288afccb9dd0be08a45cf3c27048ae6f 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -348,6 +348,7 @@ void axpy<platform::CUDADeviceContext, double>(
                                                 &alpha, x, 1, y, 1));
 }
 
+template struct SetConstant<platform::CUDADeviceContext, platform::float16>;
 template struct SetConstant<platform::CUDADeviceContext, float>;
 template struct SetConstant<platform::CUDADeviceContext, double>;
 template struct SetConstant<platform::CUDADeviceContext, int>;
@@ -422,6 +423,8 @@ struct RowwiseAdd<platform::CUDADeviceContext, T> {
 template struct RowwiseAdd<platform::CUDADeviceContext, float>;
 template struct RowwiseAdd<platform::CUDADeviceContext, double>;
 template struct ColwiseSum<platform::CUDADeviceContext, float>;
+template struct ColwiseSum<platform::CUDADeviceContext, int>;
+template struct ColwiseSum<platform::CUDADeviceContext, int64_t>;
 // template struct ColwiseSum<platform::CUDADeviceContext, double>;
 // The ColwiseSum<platform::CUDADeviceContext, double> failed in debug mode,
 // and only failed for this case. So reimplemented it.
diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu
index 34ea6a91ce7743462d378cf471a5ec3a12ca51d1..5518ebed3f792a5acdfbb27976bc2c6dbd78069a 100644
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -89,6 +89,7 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
       XGrad->mutable_data<T>(context.GetPlace())));
 }
 
+template class SoftmaxCUDNNFunctor<platform::float16>;
 template class SoftmaxCUDNNFunctor<float>;
 template class SoftmaxCUDNNFunctor<double>;
 template class SoftmaxGradCUDNNFunctor<float>;
diff --git a/paddle/fluid/operators/mine_hard_examples_op.cc b/paddle/fluid/operators/mine_hard_examples_op.cc
index 0e81d60878dce747b047abbe4641b71462373b2b..277901cff493445e1e85e92e22ea0ada0e1cba43 100644
--- a/paddle/fluid/operators/mine_hard_examples_op.cc
+++ b/paddle/fluid/operators/mine_hard_examples_op.cc
@@ -324,8 +324,9 @@ MatchIndices elements with value -1.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp,
-                             ops::MineHardExamplesOpMaker);
+REGISTER_OPERATOR(mine_hard_examples, ops::MineHardExamplesOp,
+                  ops::MineHardExamplesOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
 
 REGISTER_OP_CPU_KERNEL(
     mine_hard_examples,
diff --git a/paddle/fluid/operators/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn_activation_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..083d03ebe610521c5a4beb7b977a8179700bcf40
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn_activation_op.h
@@ -0,0 +1,111 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename Functor>
+class MKLDNNActivationKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(context.Input<framework::Tensor>("X") != nullptr,
+                   "Cannot get input tensor X, variable name = %s",
+                   context.op().Input("X"));
+    PADDLE_ENFORCE(context.Output<framework::Tensor>("Out") != nullptr,
+                   "Cannot find output tensor Out, variable name = %s",
+                   context.op().Output("Out"));
+    Functor functor;
+
+    auto attrs = functor.GetAttrs();
+    for (auto& attr : attrs) {
+      *attr.second = context.Attr<float>(attr.first);
+    }
+    functor(context);
+  }
+};
+
+template <typename Functor>
+class MKLDNNActivationGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    Functor functor;
+
+    auto attrs = functor.GetAttrs();
+    for (auto& attr : attrs) {
+      *attr.second = context.Attr<float>(attr.first);
+    }
+    functor(context);
+  }
+};
+
+namespace {
+framework::OpKernelType GetKernelType(
+    const framework::ExecutionContext& ctx,
+    const framework::OperatorWithKernel& oper) {
+  framework::LibraryType library{framework::LibraryType::kPlain};
+#ifdef PADDLE_WITH_MKLDNN
+  if (library == framework::LibraryType::kPlain &&
+      platform::CanMKLDNNBeUsed(ctx)) {
+    library = framework::LibraryType::kMKLDNN;
+  }
+#endif
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+      ctx.GetPlace(), layout, library);
+}
+}  // anonymous namespace
+
+class ActivationWithMKLDNNOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return GetKernelType(ctx, *this);
+  }
+};
+
+class ActivationWithMKLDNNOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return GetKernelType(ctx, *this);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
index 781d96981e4c033d9287ab3de9860dfd9fcd2875..39c862b03ad497dca5c38ccecff20be510ab60e5 100644
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -24,6 +24,8 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor;
 using DataLayout = platform::DataLayout;
 using PoolingMode = platform::PoolingMode;
+template <typename T>
+using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 
 template <typename T>
 class PoolCUDNNOpKernel : public framework::OpKernel<T> {
@@ -78,8 +80,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
 
     // ------------------- cudnn pool algorithm ---------------------
     auto handle = ctx.cuda_device_context().cudnn_handle();
-    T alpha = 1.0f, beta = 0.0f;
-
+    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward(
         handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
         cudnn_output_desc, output_data));
@@ -144,8 +145,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
 
     // ------------------- cudnn pool algorithm ---------------------
     auto handle = ctx.cuda_device_context().cudnn_handle();
-    T alpha = 1.0f, beta = 0.0f;
-
+    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
@@ -162,17 +162,19 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 
-REGISTER_OP_KERNEL(pool2d, CUDNN, ::paddle::platform::CUDAPlace,
+REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNOpKernel<float>,
-                   ops::PoolCUDNNOpKernel<double>);
-REGISTER_OP_KERNEL(pool2d_grad, CUDNN, ::paddle::platform::CUDAPlace,
+                   ops::PoolCUDNNOpKernel<double>,
+                   ops::PoolCUDNNOpKernel<plat::float16>);
+REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNGradOpKernel<float>,
                    ops::PoolCUDNNGradOpKernel<double>);
 
-REGISTER_OP_KERNEL(pool3d, CUDNN, ::paddle::platform::CUDAPlace,
+REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNOpKernel<float>,
                    ops::PoolCUDNNOpKernel<double>);
-REGISTER_OP_KERNEL(pool3d_grad, CUDNN, ::paddle::platform::CUDAPlace,
+REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNGradOpKernel<float>,
                    ops::PoolCUDNNGradOpKernel<double>);
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index d78da10016a0e2b1d9a0ca9f3dfe4e8009bbe61d..b144ec5f7d315cb340dcd94b4a519bfcfd2a0e66 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -124,11 +124,15 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
   }
 #endif
 
+  auto input_data_type = framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  if (input_data_type == framework::proto::VarType::FP16) {
+    PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
+                      "float16 can only be used when CUDNN is used");
+  }
   std::string data_format = ctx.Attr<std::string>("data_format");
   framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-      layout_, library_);
+  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
+                                 library_);
 }
 
 Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc
index 7ba55437cb20f802cc12ceea7777d7d78bba62a6..c22a55bce263423d5c17fffdb06b7ece02ae26da 100644
--- a/paddle/fluid/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/prior_box_op.cc
@@ -168,7 +168,9 @@ https://arxiv.org/abs/1512.02325.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker);
+REGISTER_OPERATOR(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+
 REGISTER_OP_CPU_KERNEL(
     prior_box, ops::PriorBoxOpKernel<paddle::platform::CPUPlace, float>,
     ops::PriorBoxOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt
index 744bd3b7ef71f83ad82979eb966369c2e9456a7d..6fa0195b9ae103418beb56cc4b0fa9ab59e93108 100644
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -15,10 +15,12 @@ function(reader_library TARGET_NAME)
         PARENT_SCOPE)
 endfunction()
 
+reader_library(open_files_op SRCS open_files_op.cc)
 reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)
 reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc)
 reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc)
 reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc)
 reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
+reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
 # Export local libraries to parent
 set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index d0de092947eb04a1b7d06dedea919f6b1094dd06..76cdb794ccdb4a015ae8630940a5c26845e7a7b3 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -48,20 +48,24 @@ class DoubleBufferReader : public framework::DecoratedReader {
 
   void start_thread() {
     buffer_ = framework::MakeChannel<Item>(kDoubleBufferSize);
-    std::thread prefetch([this] { PrefetchThreadFunc(); });
-    prefetch.detach();
+    prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
   }
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override;
   void ReInit() override;
 
-  ~DoubleBufferReader() { buffer_->Close(); }
+  ~DoubleBufferReader() {
+    buffer_->Close();
+    prefetcher_.join();
+    delete buffer_;
+  }
 
   bool HasNext() const override;
 
  private:
   void PrefetchThreadFunc();
 
+  std::thread prefetcher_;
   framework::Channel<Item>* buffer_;
   platform::Place place_;
   std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_;
@@ -120,10 +124,13 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
 };
 
 void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+  if (!HasNext()) {
+    PADDLE_THROW("There is no next data!");
+  }
+
   if (local_buffer_.payloads_.empty()) {
     buffer_->Receive(&local_buffer_);
   }
-
   *out = local_buffer_.payloads_;
   local_buffer_.payloads_.clear();
   if (local_buffer_.ctx_) {
@@ -134,6 +141,8 @@ void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
 void DoubleBufferReader::ReInit() {
   reader_->ReInit();
   buffer_->Close();
+  prefetcher_.join();
+  delete buffer_;
   start_thread();
 }
 
@@ -159,11 +168,12 @@ void DoubleBufferReader::PrefetchThreadFunc() {
 
     if (!buffer_->Send(&batch)) {
       VLOG(5) << "WARNING: The double buffer channel has been closed. The "
-                 "prefetch thread terminates.";
+                 "prefetch thread will terminate.";
       break;
     }
   }
   buffer_->Close();
+  VLOG(5) << "Prefetch thread terminates.";
 }
 
 bool DoubleBufferReader::HasNext() const {
diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4d4e9fb909eafea5328491a4097276577f28a5ba
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
@@ -0,0 +1,101 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class MultiPassReader : public framework::DecoratedReader {
+ public:
+  MultiPassReader(ReaderBase* reader, int pass_num)
+      : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    if (!HasNext()) {
+      PADDLE_THROW("There is no next data!");
+    }
+    reader_->ReadNext(out);
+  }
+
+  bool HasNext() const override {
+    if (reader_->HasNext()) {
+      return true;
+    } else {
+      ++pass_count_;
+      if (pass_count_ >= pass_num_) {
+        return false;
+      } else {
+        reader_->ReInit();
+        return true;
+      }
+    }
+  }
+
+  void ReInit() override {
+    pass_count_ = 0;
+    reader_->ReInit();
+  }
+
+ private:
+  int pass_num_;
+  mutable int pass_count_;
+};
+
+class CreateMultiPassReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    auto& out = detail::Ref(scope.FindVar(Output("Out")));
+    int pass_num = Attr<int>("pass_num");
+    out.GetMutable<framework::ReaderHolder>()->Reset(
+        new MultiPassReader(underlying_reader.Get(), pass_num));
+  }
+};
+
+class CreateMultiPassReaderOpMaker : public DecoratedReaderMakerBase {
+ public:
+  CreateMultiPassReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : DecoratedReaderMakerBase(op_proto, op_checker) {
+    AddAttr<int>("pass_num", "The number of pass to run.").GreaterThan(0);
+    AddComment(R"DOC(
+      CreateMultiPassReader Operator
+
+      This operator creates a multi-pass reader. A multi-pass reader 
+      is used to yield data for several pass training continuously. 
+      It takes the the number of pass to run as one of its attributes
+      ('pass_num'), and maintains a pass counter to record how many 
+      passes it has completed. When the underlying reader reach the EOF, 
+      the multi-pass reader checks whether it has completed training 
+      of the given number of pass. If not, the underlying reader will 
+      be re-initialized and starts a new pass automatically.
+    )DOC");
+  }
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators::reader;
+REGISTER_DECORATED_READER_OPERATOR(create_multi_pass_reader,
+                                   ops::CreateMultiPassReaderOp,
+                                   ops::CreateMultiPassReaderOpMaker);
diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
index 70e2f587dc414a850ddc341b98f26ae54636755c..3a1f3805a0483c2f5eabdc7432556051d8308964 100644
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -34,6 +34,9 @@ class ShuffleReader : public framework::DecoratedReader {
   }
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    if (!HasNext()) {
+      PADDLE_THROW("There is no next data!");
+    }
     if (iteration_pos_ >= buffer_.size()) {
       VLOG(10) << "Resetting shuffle buffer";
       ReadIntoBuffers();
@@ -50,7 +53,6 @@ class ShuffleReader : public framework::DecoratedReader {
     buffer_.clear();
     buffer_.reserve(buffer_size_);
     iteration_pos_ = 0;
-    PADDLE_ENFORCE(reader_->HasNext());
     for (size_t i = 0; i < buffer_size_; ++i) {
       if (!reader_->HasNext()) {
         break;
diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..414c76fea0bb916dfeafe38c0448a7a800889e03
--- /dev/null
+++ b/paddle/fluid/operators/reader/open_files_op.cc
@@ -0,0 +1,212 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class MultipleReader : public framework::ReaderBase {
+ public:
+  MultipleReader(const std::vector<std::string>& file_names,
+                 const std::vector<framework::DDim>& dims, size_t thread_num)
+      : file_names_(file_names), dims_(dims) {
+    prefetchers_.resize(thread_num);
+    StartNewScheduler();
+  }
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override;
+  bool HasNext() const override;
+  void ReInit() override;
+
+  ~MultipleReader() { EndScheduler(); }
+
+ private:
+  void StartNewScheduler();
+  void EndScheduler();
+  void ScheduleThreadFunc();
+  void PrefetchThreadFunc(std::string file_name, size_t thread_idx);
+
+  std::vector<std::string> file_names_;
+  std::vector<framework::DDim> dims_;
+  std::thread scheduler_;
+  std::vector<std::thread> prefetchers_;
+  framework::Channel<size_t>* waiting_file_idx_;
+  framework::Channel<size_t>* available_thread_idx_;
+  framework::Channel<std::vector<framework::LoDTensor>>* buffer_;
+  mutable std::vector<framework::LoDTensor> local_buffer_;
+};
+
+void MultipleReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+  if (!HasNext()) {
+    PADDLE_THROW("There is no next data!");
+  }
+
+  if (local_buffer_.empty()) {
+    buffer_->Receive(&local_buffer_);
+  }
+  *out = local_buffer_;
+  local_buffer_.clear();
+}
+
+bool MultipleReader::HasNext() const {
+  return local_buffer_.empty() ? buffer_->Receive(&local_buffer_) : true;
+}
+
+void MultipleReader::ReInit() {
+  EndScheduler();
+  local_buffer_.clear();
+  StartNewScheduler();
+}
+
+void MultipleReader::StartNewScheduler() {
+  size_t thread_num = prefetchers_.size();
+  waiting_file_idx_ = framework::MakeChannel<size_t>(file_names_.size());
+  available_thread_idx_ = framework::MakeChannel<size_t>(thread_num);
+  buffer_ =
+      framework::MakeChannel<std::vector<framework::LoDTensor>>(thread_num);
+
+  for (size_t i = 0; i < file_names_.size(); ++i) {
+    waiting_file_idx_->Send(&i);
+  }
+  waiting_file_idx_->Close();
+  for (size_t i = 0; i < thread_num; ++i) {
+    available_thread_idx_->Send(&i);
+  }
+
+  scheduler_ = std::thread([this] { ScheduleThreadFunc(); });
+}
+
+void MultipleReader::EndScheduler() {
+  available_thread_idx_->Close();
+  buffer_->Close();
+  waiting_file_idx_->Close();
+  if (scheduler_.joinable()) {
+    scheduler_.join();
+  }
+  delete buffer_;
+  delete available_thread_idx_;
+  delete waiting_file_idx_;
+}
+
+void MultipleReader::ScheduleThreadFunc() {
+  VLOG(5) << "MultipleReader schedule thread starts.";
+  size_t completed_thread_num = 0;
+  size_t thread_idx;
+  while (available_thread_idx_->Receive(&thread_idx)) {
+    std::thread& prefetcher = prefetchers_[thread_idx];
+    if (prefetcher.joinable()) {
+      prefetcher.join();
+    }
+    size_t file_idx;
+    if (waiting_file_idx_->Receive(&file_idx)) {
+      // Still have files to read. Start a new prefetch thread.
+      std::string file_name = file_names_[file_idx];
+      prefetcher = std::thread([this, file_name, thread_idx] {
+        PrefetchThreadFunc(file_name, thread_idx);
+      });
+    } else {
+      // No more file to read.
+      ++completed_thread_num;
+      if (completed_thread_num == prefetchers_.size()) {
+        buffer_->Close();
+        break;
+      }
+    }
+  }
+  // If users invoke ReInit() when scheduler is running, it will close the
+  // 'avaiable_thread_idx_' and prefecther threads have no way to tell scheduler
+  // to release their resource. So a check is needed before scheduler ends.
+  for (auto& p : prefetchers_) {
+    if (p.joinable()) {
+      p.join();
+    }
+  }
+  VLOG(5) << "MultipleReader schedule thread terminates.";
+}
+
+void MultipleReader::PrefetchThreadFunc(std::string file_name,
+                                        size_t thread_idx) {
+  VLOG(5) << "The prefetch thread of file '" << file_name << "' starts.";
+  std::unique_ptr<framework::ReaderBase> reader =
+      CreateReaderByFileName(file_name, dims_);
+  while (reader->HasNext()) {
+    std::vector<framework::LoDTensor> ins;
+    reader->ReadNext(&ins);
+    if (!buffer_->Send(&ins)) {
+      VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch "
+                 "thread of file '"
+              << file_name << "' will terminate.";
+      break;
+    }
+  }
+  if (!available_thread_idx_->Send(&thread_idx)) {
+    VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. "
+               "Fail to send thread_idx.";
+  }
+  VLOG(5) << "The prefetch thread of file '" << file_name << "' terminates.";
+}
+
+class OpenFilesOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
+    const auto& ranks = Attr<std::vector<int>>("ranks");
+    PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
+    PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
+                      int(shape_concat.size()),
+                      "The accumulate of all ranks should be equal to the "
+                      "shape concat's length.");
+    const auto& file_names = Attr<std::vector<std::string>>("file_names");
+    PADDLE_ENFORCE(!file_names.empty(), "No file to be read!");
+    const size_t thread_num = Attr<int>("thread_num");
+
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(new MultipleReader(
+        file_names, RestoreShapes(shape_concat, ranks), thread_num));
+  }
+};
+
+class OpenFilesOpMaker : public FileReaderMakerBase {
+ public:
+  OpenFilesOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : FileReaderMakerBase(op_proto, op_checker) {
+    AddAttr<std::vector<std::string>>("file_names", "Files to be read.");
+    AddAttr<int>("thread_num", "The maximal concurrent prefetch thread number.")
+        .GreaterThan(0);
+
+    AddComment(R"DOC(
+      OpenFiles Operator
+
+      An OpenFilesOp creates a MultipleReader, which is able to 
+      read data multi-threaded from multiple files.
+    )DOC");
+  }
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace reader = paddle::operators::reader;
+
+REGISTER_FILE_READER_OPERATOR(open_files, reader::OpenFilesOp,
+                              reader::OpenFilesOpMaker);
diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc
index 0ba4f3854431742eb354f8c90eb395f5d7b32b2e..fc8dc747ff0c2286f4516d8350f75d9887361924 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@@ -36,6 +36,21 @@ std::unordered_map<std::string, FileReaderCreator>& FileReaderRegistry() {
   return regs;
 }
 
+std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
+    const std::string& file_name, const std::vector<framework::DDim>& dims) {
+  size_t separator_pos = file_name.find_last_of(kFileFormatSeparator);
+  PADDLE_ENFORCE_NE(separator_pos, std::string::npos,
+                    "File name illegal! A legal file name should be like: "
+                    "[file_name].[file_format] (e.g., 'data_file.recordio').");
+  std::string filetype = file_name.substr(separator_pos + 1);
+
+  auto itor = FileReaderRegistry().find(filetype);
+  PADDLE_ENFORCE(itor != FileReaderRegistry().end(),
+                 "No file reader registered for '%s' format.", filetype);
+  framework::ReaderBase* reader = (itor->second)(file_name, dims);
+  return std::unique_ptr<framework::ReaderBase>(reader);
+}
+
 FileReaderMakerBase::FileReaderMakerBase(
     framework::OpProtoAndCheckerMaker::OpProto* op_proto,
     framework::OpAttrChecker* op_checker)
diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h
index 58f9b4ba35546571fd3b1d0c3ce128f18e248f01..929d32ad8b367865e33530f8517343c513ee9878 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.h
+++ b/paddle/fluid/operators/reader/reader_op_registry.h
@@ -21,6 +21,8 @@ namespace paddle {
 namespace operators {
 namespace reader {
 
+static constexpr char kFileFormatSeparator[] = ".";
+
 using FileReaderCreator = std::function<framework::ReaderBase*(
     const std::string&, const std::vector<framework::DDim>&)>;
 
@@ -29,12 +31,15 @@ std::unordered_map<std::string, FileReaderCreator>& FileReaderRegistry();
 template <typename Reader>
 int RegisterFileReader(const std::string& filetype) {
   FileReaderRegistry()[filetype] = [](
-      const std::string& fn, const std::vector<paddle::framework::DDim>& dim) {
-    return new Reader(fn, dim);
+      const std::string& fn, const std::vector<framework::DDim>& dims) {
+    return new Reader(fn, dims);
   };
   return 0;
 }
 
+std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
+    const std::string& file_name, const std::vector<framework::DDim>& dims);
+
 extern std::vector<framework::DDim> RestoreShapes(
     const std::vector<int>& shape_concat, const std::vector<int>& ranks);
 
diff --git a/paddle/fluid/operators/select_op.cc b/paddle/fluid/operators/select_op.cc
index 8344a239df7b3fcbe91f91a17a3c5958013b55a6..c0bf0ff927481bc4da9cd6c4bb9b0c4a6841c891 100644
--- a/paddle/fluid/operators/select_op.cc
+++ b/paddle/fluid/operators/select_op.cc
@@ -27,6 +27,7 @@ namespace operators {
 
 static constexpr char kX[] = "X";
 static constexpr char kCaseToExecute[] = "case_to_execute";
+static constexpr char kOutputs[] = "Out";
 
 static constexpr char kCases[] = "cases";
 static constexpr char kCasesBlock[] = "sub_block";
@@ -388,6 +389,10 @@ class SelectOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Int) The variable the sets the index of the case to execute, "
              "after evaluating the channels being sent to and received from")
         .AsDuplicable();
+    AddOutput(kOutputs,
+              "A set of variables, which will be assigned with values "
+              "generated by the operators inside the cases of Select Op.")
+        .AsDuplicable();
     AddAttr<std::vector<std::string>>(kCases,
                                       "(String vector) Serialized list of"
                                       "all cases in the select op. Each"
diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8d02a6f29177536562e38372eb0336424aa0a47c
--- /dev/null
+++ b/paddle/fluid/operators/send_barrier_op.cc
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <ostream>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#include <future>
+#include "paddle/fluid/operators/detail/grpc_client.h"
+
+namespace paddle {
+namespace operators {
+
+class SendBarrierOp : public framework::OperatorBase {
+ public:
+  SendBarrierOp(const std::string& type,
+                const framework::VariableNameMap& inputs,
+                const framework::VariableNameMap& outputs,
+                const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
+
+    auto client_var_name = Output("RPCClient");
+    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
+                            "Can not find variable '%s' in the scope.",
+                            client_var_name);
+    auto* client_var = scope.FindVar(client_var_name);
+    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
+
+    // need to wait before sending send_barrier message
+    PADDLE_ENFORCE(rpc_client->Wait());
+
+    for (auto& ep : eps) {
+      VLOG(3) << "send barrier, ep: " << ep;
+      rpc_client->AsyncSendBatchBarrier(ep);
+    }
+    PADDLE_ENFORCE(rpc_client->Wait());
+  }
+};
+
+class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SendBarrierOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("RPCClient",
+              "(RPCClient) The RPC client object which is"
+              "initialized at most once.");
+    AddComment(R"DOC(
+SendBarrier operator
+
+This operator will send a send barrier signal to list_and_serv op, so that
+the Parameter Server would knew all variables have been sent.
+)DOC");
+
+    AddAttr<std::vector<std::string>>("endpoints",
+                                      "(string vector, default 127.0.0.1:6164)"
+                                      "Server endpoints to send variables to.")
+        .SetDefault({"127.0.0.1:6164"});
+  }
+};
+
+class SendBarrierOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    auto out_var_name = op_desc.Output("RPCClient").front();
+    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
+    auto var_type = framework::proto::VarType::RAW;
+    out_var.SetType(var_type);
+  }
+};
+
+class SendBarrierOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(send_barrier, ops::SendBarrierOp,
+                  paddle::framework::EmptyGradOpMaker, ops::SendBarrierOpMaker,
+                  ops::SendBarrierOpVarTypeInference,
+                  ops::SendBarrierOpShapeInference);
diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc
index 443f40e803ea31c3961ed77842bd0775e0f74f35..fdf3c06ef0a7c2daa7c484375065ac2110e07478 100644
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 
 #include <future>
 #include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -59,6 +60,9 @@ class SendOp : public framework::OperatorBase {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
 
+    // For profiling
+    platform::RecordEvent record_event(Type(), &ctx);
+
     auto client_var_name = Output("RPCClient");
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
                             "Can not find variable '%s' in the scope.",
@@ -68,7 +72,7 @@ class SendOp : public framework::OperatorBase {
 
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
-        VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
+        VLOG(2) << "sending " << ins[i] << " to " << epmap[i];
         rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
       } else {
         VLOG(3) << "don't send no-initialied variable: " << ins[i];
@@ -77,20 +81,20 @@ class SendOp : public framework::OperatorBase {
     PADDLE_ENFORCE(rpc_client->Wait());
 
     for (auto& ep : endpoints) {
-      VLOG(3) << "batch barrier, ep: " << ep;
+      VLOG(2) << "batch barrier, ep: " << ep;
       rpc_client->AsyncSendBatchBarrier(ep);
     }
     PADDLE_ENFORCE(rpc_client->Wait());
 
     if (outs.size() > 0) {
       for (size_t i = 0; i < outs.size(); i++) {
-        VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
+        VLOG(2) << "getting " << outs[i] << " from " << epmap[i];
         rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
       }
       PADDLE_ENFORCE(rpc_client->Wait());
       // tell pservers that current trainer have called fetch
       for (auto& ep : endpoints) {
-        VLOG(3) << "send fetch barrier, ep: " << ep;
+        VLOG(2) << "send fetch barrier, ep: " << ep;
         rpc_client->AsyncSendFetchBarrier(ep);
       }
       PADDLE_ENFORCE(rpc_client->Wait());
diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..523e9e27808e428acb7900fe90a29de80f316bfb
--- /dev/null
+++ b/paddle/fluid/operators/send_vars_op.cc
@@ -0,0 +1,134 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <ostream>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#include <future>
+#include "paddle/fluid/operators/detail/grpc_client.h"
+
+namespace paddle {
+namespace operators {
+static bool NeedSend(const framework::Scope& scope,
+                     const std::string& varname) {
+  auto* var = scope.FindVar(varname);
+  PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
+                          varname);
+  if (var->IsType<framework::LoDTensor>()) {
+    return var->Get<framework::LoDTensor>().IsInitialized();
+  } else if (var->IsType<framework::SelectedRows>()) {
+    return var->Get<framework::SelectedRows>().rows().size() > 0UL;
+  } else {
+    PADDLE_THROW(
+        "Variable type in send side should be in "
+        "[LodTensor, SelectedRows]");
+  }
+  return false;
+}
+
+class SendVarsOp : public framework::OperatorBase {
+ public:
+  SendVarsOp(const std::string& type, const framework::VariableNameMap& inputs,
+             const framework::VariableNameMap& outputs,
+             const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    auto ins = Inputs("X");
+
+    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
+    int sync_send = Attr<int>("sync_sent");
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& ctx = *pool.Get(place);
+
+    auto client_var_name = Output("RPCClient");
+    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
+                            "Can not find variable '%s' in the scope.",
+                            client_var_name);
+    auto* client_var = scope.FindVar(client_var_name);
+    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
+
+    for (size_t i = 0; i < ins.size(); i++) {
+      if (NeedSend(scope, ins[i])) {
+        VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
+        // TODO(Yancey1989): we need to use an IO threadpool which has
+        // a larger number of threads than the computing threadpool.
+        rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
+      } else {
+        VLOG(3) << "don't send no-initialied variable: " << ins[i];
+      }
+    }
+    if (sync_send) {
+      rpc_client->Wait();
+    }
+  }
+};
+
+class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SendVarsOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
+        .AsDuplicable();
+    AddOutput("RPCClient",
+              "(RPCClient) The RPC client object which will be"
+              "initialized at most once.");
+    AddComment(R"DOC(
+Send operator
+
+This operator will send variables to listen_and_serve op at the parameter server.
+)DOC");
+    AddAttr<int>("ync_send",
+                 "(int, default 0)"
+                 "sync send or async send.")
+        .SetDefault(0);
+    AddAttr<std::vector<std::string>>("epmap",
+                                      "(string vector, default 127.0.0.1:6164)"
+                                      "Server endpoints in the order of input "
+                                      "variables for mapping")
+        .SetDefault({"127.0.0.1:6164"});
+  }
+};
+
+class SendVarsOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    auto out_var_name = op_desc.Output("RPCClient").front();
+    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
+    auto var_type = framework::proto::VarType::RAW;
+    out_var.SetType(var_type);
+  }
+};
+
+class SendVarsOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(send_vars, ops::SendVarsOp,
+                  paddle::framework::EmptyGradOpMaker, ops::SendVarsOpMaker,
+                  ops::SendVarsOpVarTypeInference,
+                  ops::SendVarsOpShapeInference);
diff --git a/paddle/fluid/operators/sequence_expand_op.cc b/paddle/fluid/operators/sequence_expand_op.cc
index a5d84d629b2e50763dac9bc571ac490414a8a406..786fe63e7580ce16b946d5049a490eed2c3c6ced 100644
--- a/paddle/fluid/operators/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_expand_op.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using framework::Tensor;
+using framework::LoDTensor;
 
 class SequenceExpandOp : public framework::OperatorWithKernel {
  public:
@@ -25,15 +25,71 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"));
-    PADDLE_ENFORCE(ctx->HasOutput("Out"));
-    PADDLE_ENFORCE(ctx->HasInput("Y"));
-    framework::DDim out_dim;
-    auto y_dim = ctx->GetInputDim("Y");
-    out_dim = ctx->GetInputDim("X");
-    out_dim[0] = y_dim[0];
-    ctx->ShareLoD("Y", "Out");
-    ctx->SetOutputDim("Out", out_dim);
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceExpandOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of SequenceExpandOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceExpandOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto out_dims = x_dims;
+    int ref_level = ctx->Attrs().Get<int>("ref_level");
+
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                      "Dimension number of Input(X) should be at least 2.");
+
+    if (ctx->IsRuntime()) {
+      framework::Variable* x_var =
+          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
+      framework::Variable* y_var =
+          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Y")[0]);
+
+      auto& x_lod = x_var->Get<LoDTensor>().lod();
+      auto& y_lod = y_var->Get<LoDTensor>().lod();
+
+      PADDLE_ENFORCE_LE(x_lod.size(), 1,
+                        "Level number of Input(X)'s lod should not be "
+                        "greater than 1.");
+      PADDLE_ENFORCE_GT(y_lod.size(), 0,
+                        "Level number of Input(Y)'s lod should be "
+                        "greater than 0.");
+      PADDLE_ENFORCE(
+          ref_level == -1 ||
+              (ref_level >= 0 && ref_level < static_cast<int>(y_lod.size())),
+          "Invlid `ref_level`, which should be either equal to -1 "
+          "or in [0, %d)",
+          y_lod.size());
+
+      if (ref_level == -1) ref_level = y_lod.size() - 1;
+
+      if (x_lod.size() > 0) {
+        PADDLE_ENFORCE(x_lod[0].size() == y_lod[ref_level].size(),
+                       "Level number of Input(X)'s lod could be 0. Otherwise "
+                       "size of Input(X)'s first level lod should be equal to "
+                       "size of Input(Y)'s referred level lod.");
+      }
+
+      int64_t out_first_dim = 0;
+      if (y_lod[ref_level].size() <= 1) {
+        out_first_dim = x_dims[0];
+      } else {
+        for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
+          int x_seq_len = 1;
+          if (x_lod.size() == 1) {
+            x_seq_len = x_lod[0][i] - x_lod[0][i - 1];
+          }
+          out_first_dim +=
+              (y_lod[ref_level][i] - y_lod[ref_level][i - 1]) * x_seq_len;
+        }
+      }
+      out_dims[0] = out_first_dim;
+      ctx->SetOutputDim("Out", out_dims);
+    } else {
+      out_dims[0] = -1;
+      ctx->SetOutputDim("Out", out_dims);
+      ctx->ShareLoD("X", /*->*/ "Out");
+    }
   }
 };
 
@@ -42,83 +98,81 @@ class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker {
   SequenceExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
-             "(Tensor or LoDTensor) The input(X) of this operator can be a "
-             "LoDTensor or a base Tensor.");
+             "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor whose lod "
+             "level is at most 1.");
     AddInput("Y",
-             "(LoDTensor)The reference input(Y) of sequence_expand op."
-             "It must be a LoDTensor with k-level(k>0)."
-             "The input(X) will be expanded according to LOD of input(Y)."
-             "The element numbers of last level in input(Y) "
-             "must be equal to dims[0] of input(X).");
+             "(LoDTensor, default LoDTensor<float>) Referred LoDTensor whose "
+             "lod (specified level) is referred by Input(X).");
     AddOutput("Out",
-              "(LodTensor)The output of sequence_expand op."
-              "The lod of output will be as same as input(Y)'s lod.");
+              "(LodTensor, default LoDTensor<float>) Output LoDTensor which is "
+              "generated from Input(X) by referring lod of Input(Y).");
+    AddAttr<int>("ref_level", "Specify lod level of Input(Y).").SetDefault(-1);
     AddComment(R"DOC(
 Sequence Expand Operator.
 
-This operator expands input(X) according to LOD of input(Y).
+This operator expands `X` according to specified level lod of `Y`. Current
+implementation constaints that lod level of `X` should be at most 1. Attribute
+`ref_level` is used to specify which level lod of `Y` is referred to expand `X`.
+If set `ref_level` to -1, then last level lod of `Y` would be referred.
+Please note, rank of `X` should be at least 2, when the rank exceeds 2, `X`
+would be viewed as a 2-D tensor.
+
 Following are cases to better explain how this works:
+
 Case 1:
 
-Given a 2-level LoDTensor input(X)
-    X.lod = [[0,       2, 3],
-             [0, 1,    3, 4]]
-    X.data = [a, b, c, d]
+Given a 1-level LoDTensor input(X)
+    X.lod =  [[0,   2,        4]]
+    X.data = [[a], [b], [c], [d]]
     X.dims = [4, 1]
 and input(Y)
     Y.lod = [[0,    2,    4],
              [0, 3, 6, 7, 8]]
-with condition len(Y.lod[-1]) -1 == X.dims[0]
-then we get 2-level LoDTensor
-    Out.lod = [[0,                2,    4],
-               [0,       3,       6, 7, 8]]
-    Out.data = [a, a, a, b, b, b, c, d]
+ref_level: 0
+then we get 1-level LoDTensor
+    Out.lod =  [[0,   2,        4,        6,        8]]
+    Out.data = [[a], [b], [a], [b], [c], [d], [c], [d]]
     Out.dims = [8, 1]
 
 Case 2:
 
+Given 1-level LoDTensor input(X)
+    X.lod =  [[0,   1,        4]]
+    X.data = [[a], [b], [c], [d]]
+    X.dims = [4, 1]
+and input(Y)
+    Y.lod = [[0,    2,    4],
+             [0, 3, 6, 6, 8]]
+ref_level: 0
+then we get 1-level LoDTensor
+    Out.lod =  [[0,   1,   2,        5,             8]]
+    Out.data = [[a], [a], [b], [c], [d], [b], [c], [d]]
+    Out.dims = [8, 1]
+
+Case 3:
+
 Given a common Tensor input(X)
-    X.data = [a, b, c]
+    X.data = [[a], [b], [c]]
     X.dims = [3, 1]
 and input(Y)
     Y.lod = [[0, 2, 3, 6]]
-with condition len(Y.lod[-1]) -1 == X.dims[0]
-then we get 1-level LoDTensor
-    Out.lod = [[0,    2, 3,      6]]
-    Out.data = [a, a, b, c, c, c]
+ref_level: -1
+then we get a common Tensor
+    Out.data = [[a], [a], [b], [c], [c], [c]]
     Out.dims = [6, 1]
 
-Case 3:
+Case 4:
 
 Given a common Tensor input(X)
     X.data = [[a, b], [c, d], [e, f]]
     X.dims = [3, 2]
 and input(Y)
     Y.lod = [[0, 2, 3, 6]]
-with condition len(Y.lod[-1]) -1 == X.dims[0]
-then we get 1-level LoDTensor
-    Out.lod = [[0,           2,     3,                     6]]
-    Out.data = [[a,b], [a,b] [c,d], [e, f], [e, f], [e, f]]
+ref_level: 0
+then we get a common LoDTensor
+    Out.data = [[a, b], [a, b] [c, d], [e, f], [e, f], [e, f]]
     Out.dims = [6, 2]
 
-Case 4:
-
-Given 2-level a LoDTensor input(X)
-    X.lod = [[0,       2, 3],
-             [0, 1,    3, 4]]
-    X.data = [a, b, c, d]
-    X.dims = [4, 1]
-and input(Y)
-    Y.lod = [[0,    2,    4],
-             [0, 3, 6, 6, 8]]
-with condition len(Y.lod[-1]) -1 == X.dims[0]
-then we get 2-level LoDTensor
-    Out.lod = [[0,                2,    4],
-               [0,       3,       6, 6, 8]]
-    Out.data = [a, a, a, b, b, b, d, d]
-    Out.dims = [8, 1]
-
-
 )DOC");
   }
 };
@@ -129,12 +183,14 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"));
-    PADDLE_ENFORCE(ctx->HasInput("Out"));
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "The input(Out@GRAD) should not be null");
+                   "Input(Out@GRAD) should not be null.");
+
     auto x_dims = ctx->GetInputDim("X");
     auto x_grad_name = framework::GradVarName("X");
+
     if (ctx->HasOutput(x_grad_name)) {
       ctx->SetOutputDim(x_grad_name, x_dims);
     }
@@ -149,7 +205,13 @@ REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker,
             sequence_expand_grad, ops::SequenceExpandOpGrad);
 REGISTER_OP_CPU_KERNEL(
     sequence_expand,
-    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, int64_t>);
 REGISTER_OP_CPU_KERNEL(
     sequence_expand_grad,
-    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index 26622d23afa1c703e237628bcb11db8f1da73210..bb51bb2902eea797de3449fcb6c8b52b4f0e7fbf 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -18,7 +18,14 @@ limitations under the License. */
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     sequence_expand,
-    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     sequence_expand_grad,
-    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext,
+                                  int64_t>);
diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h
index 76dde976db2d19e307ae7406be8280f9b4987187..db7d8bd6821fabd9714a160970558291ec47197f 100644
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
@@ -16,45 +16,75 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
-#include "unsupported/Eigen/CXX11/Tensor"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
 
 using LoDTensor = framework::LoDTensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename DeviceContext, typename T>
 class SequenceExpandKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    const T* x_data = x->data<T>();
-    auto x_dims = x->dims();
     auto* y = context.Input<LoDTensor>("Y");
-    PADDLE_ENFORCE(!y->lod().empty(), "y should have lod");
-    PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims[0]),
-                      y->lod().back().size() - 1,
-                      "The size of last lod level in Input(Y)"
-                      "must be equal to dims[0] of Input(X).");
-    out->set_lod(y->lod());
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-    size_t element_len = framework::product(x_dims) / x_dims[0];
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-    auto out_starts = out->lod().back();
-
-    for (size_t i = 0; i < out_starts.size() - 1; i++) {
-      int scale = out_starts[i + 1] - out_starts[i];
-      Eigen::TensorMap<
-          Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
-          x_t(x_data, 1, element_len);
-      Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
-          out_t(out_data, scale, element_len);
-      Eigen::array<int, 2> cast({{scale, 1}});
-      out_t.device(*place) = x_t.broadcast(cast);
-      x_data += element_len;
-      out_data += element_len * scale;
+    auto* out = context.Output<LoDTensor>("Out");
+
+    int ref_level = context.Attr<int>("ref_level");
+    auto& x_lod = x->lod();
+    auto& y_lod = y->lod();
+
+    if (ref_level == -1) ref_level = y_lod.size() - 1;
+
+    out->mutable_data<T>(context.GetPlace());
+
+    if (y_lod[ref_level].size() <= 1) {
+      framework::TensorCopy(*x, context.GetPlace(), out);
+      return;
+    }
+
+    auto& out_lod = *out->mutable_lod();
+    if (x_lod.size() == 1) {
+      out_lod.resize(1);
+      out_lod[0] = {0};
+    }
+
+    int out_offset = 0;
+    auto& eigen_place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
+      int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
+      int x_start = i - 1;
+      int x_end = i;
+      if (x_lod.size() == 1) {
+        x_start = x_lod[0][i - 1];
+        x_end = x_lod[0][i];
+      }
+      int x_seq_len = x_end - x_start;
+      if (repeat_num > 0) {
+        auto x_sub_tensor = x->Slice(x_start, x_end);
+        x_sub_tensor.Resize({1, x_sub_tensor.numel()});
+        int out_start = out_offset;
+        if (x_lod.size() == 1) {
+          out_start = out_lod[0][out_offset];
+        }
+        auto out_sub_tensor =
+            out->Slice(out_start, out_start + x_seq_len * repeat_num);
+        out_sub_tensor.Resize({repeat_num, x_sub_tensor.dims()[1]});
+        EigenMatrix<T>::From(out_sub_tensor).device(eigen_place) =
+            EigenMatrix<T>::From(x_sub_tensor)
+                .broadcast(Eigen::array<int, 2>({{repeat_num, 1}}));
+      }
+      for (int j = 0; j < repeat_num; ++j) {
+        if (x_lod.size() == 1) {
+          out_lod[0].push_back(out_lod[0].back() + x_seq_len);
+        }
+        out_offset++;
+      }
     }
   }
 };
@@ -75,27 +105,51 @@ template <typename DeviceContext, typename T>
 class SequenceExpandGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* g_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
     auto* x = context.Input<LoDTensor>("X");
-    auto* out = context.Input<LoDTensor>("Out");
-    auto* d_x = context.Output<LoDTensor>(framework::GradVarName("X"));
-    auto out_last_level = out->lod().back();
-    d_x->set_lod(x->lod());
-    const T* d_out_data = d_out->data<T>();
-    T* d_x_data = d_x->mutable_data<T>(context.GetPlace());
-    size_t element_len = d_out->numel() / d_out->dims()[0];
-    for (size_t i = 0; i < out_last_level.size() - 1; ++i) {
-      size_t repeat = out_last_level[i + 1] - out_last_level[i];
-      Eigen::TensorMap<
-          Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
-      d_out_t(d_out_data, static_cast<int>(repeat), element_len);
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>
-      d_x_t(d_x_data, static_cast<int>(element_len));
-      auto place =
-          context.template device_context<DeviceContext>().eigen_device();
-      d_x_t.device(*place) = d_out_t.sum(Eigen::array<int, 1>({{0}}));
-      d_out_data += (repeat * element_len);
-      d_x_data += element_len;
+    auto* y = context.Input<LoDTensor>("Y");
+    auto* g_x = context.Output<LoDTensor>(framework::GradVarName("X"));
+    int ref_level = context.Attr<int>("ref_level");
+
+    g_x->mutable_data<T>(context.GetPlace());
+    g_x->set_lod(x->lod());
+
+    auto& x_lod = x->lod();
+    auto& y_lod = y->lod();
+
+    if (ref_level == -1) ref_level = y_lod.size() - 1;
+
+    // just copy the gradient
+    if (y_lod[ref_level].size() <= 1) {
+      framework::TensorCopy(*g_out, context.GetPlace(), g_x);
+      return;
+    }
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    math::SetConstant<DeviceContext, T> set_zero;
+    set_zero(dev_ctx, g_x, static_cast<T>(0));
+
+    int g_out_offset = 0;
+    for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
+      int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
+      if (repeat_num > 0) {
+        int x_start = i - 1;
+        int x_end = i;
+        if (x_lod.size() == 1) {
+          x_start = x_lod[0][i - 1];
+          x_end = x_lod[0][i];
+        }
+        int x_seq_len = x_end - x_start;
+        auto g_x_sub = g_x->Slice(x_start, x_end);
+        g_x_sub.Resize(flatten_to_1d(g_x_sub.dims()));
+        int g_out_end = g_out_offset + repeat_num * x_seq_len;
+        auto g_out_sub = g_out->Slice(g_out_offset, g_out_end);
+        g_out_sub.Resize({repeat_num, g_x_sub.dims()[0]});
+        math::ColwiseSum<DeviceContext, T> col_sum;
+        col_sum(dev_ctx, g_out_sub, &g_x_sub);
+        g_out_offset += repeat_num * x_seq_len;
+      }
     }
   }
 };
diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
index 47cb336d87f8627d86ac33d6ac32c04d5d93f753..5596fa0648ccc151bc0d11de9c556599428a8d71 100644
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
@@ -56,7 +56,9 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_KERNEL(softmax, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::SoftmaxCUDNNKernel<float>);
-REGISTER_OP_KERNEL(softmax_grad, CUDNN, ::paddle::platform::CUDAPlace,
+namespace plat = paddle::platform;
+REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
+                   ops::SoftmaxCUDNNKernel<float>,
+                   ops::SoftmaxCUDNNKernel<plat::float16>);
+REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
                    ops::SoftmaxGradCUDNNKernel<float>);
diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cf0244e8662e827a90d8472a097315680579ff6d
--- /dev/null
+++ b/paddle/fluid/operators/softmax_mkldnn_op.cc
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "mkldnn.hpp"
+#include "paddle/fluid/operators/softmax_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+#include <iostream>
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::Tensor;
+using paddle::platform::MKLDNNDeviceContext;
+using paddle::platform::MKLDNNMemDesc;
+
+using mkldnn::memory;  // Note: paddle has also "memory" namespace
+using mkldnn::primitive;
+using mkldnn::softmax_forward;
+using mkldnn::prop_kind;
+using mkldnn::stream;
+
+template <typename T>
+class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    auto mkldnn_engine = dev_ctx.GetEngine();
+    const Tensor* input = ctx.Input<Tensor>("X");
+    Tensor* output = ctx.Output<Tensor>("Out");
+    PADDLE_ENFORCE(input->dims().size() == 2UL,
+                   "The input of softmax op must be a 2D matrix.");
+    const T* input_data = input->data<T>();
+    // allocate memory for output
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
+    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+    // MKL-DNN does support softmax over selected axis. Having 2D Tensor,
+    // we will make normalization after final eg. axis: 1
+    PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
+                   "Softmax input and output dimensions should match");
+    // Same memory descriptor to be used for input and output
+    memory::dims softmax_tz = {src_tz[0], src_tz[1]};
+    // Currently only supports NC data format
+    // TODO(jczaja-intel): support more formats
+    auto softmax_md =
+        MKLDNNMemDesc({softmax_tz}, memory::f32, memory::format::nc);
+    // Normalization is made after innermost dimension eg. C out of NC
+    auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring,
+                                              softmax_md, 1 /*dim: C*/);
+    // create memory primitives
+    auto softmax_src_memory =
+        memory({softmax_md, mkldnn_engine}, (void*)input_data);
+    auto softmax_dst_memory =
+        memory({softmax_md, mkldnn_engine}, (void*)output_data);
+    auto softmax_prim_desc =
+        softmax_forward::primitive_desc(softmax_desc, mkldnn_engine);
+    auto softmax = softmax_forward(softmax_prim_desc, softmax_src_memory,
+                                   softmax_dst_memory);
+    std::vector<primitive> pipeline{softmax};
+    stream(stream::kind::eager).submit(pipeline).wait();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(softmax, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::SoftmaxMKLDNNKernel<float>);
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 1b63f8a499e5d20d2f10c3cd1024d1bcf78764d4..e2c0f915d96b7746191572fa27b725d90cb6e2e5 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -13,7 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/softmax_op.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 namespace paddle {
 namespace operators {
 
@@ -38,26 +44,32 @@ class SoftmaxOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
-    bool use_cudnn = ctx.Attr<bool>("use_cudnn");
-    bool runtime_cudnn_support = false;
+    framework::LibraryType library_{framework::LibraryType::kPlain};
 #ifdef PADDLE_WITH_CUDA
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
-      runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
+    if (platform::CanCUDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kCUDNN;
     }
 #endif
-    framework::LibraryType library_ = framework::LibraryType::kPlain;
-    if (use_cudnn && runtime_cudnn_support) {
-      library_ = framework::LibraryType::kCUDNN;
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
     }
+#endif
+
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    if (input_data_type == framework::proto::VarType::FP16) {
+      PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
+                        "float16 can only be used when CUDNN is used");
+    }
+
     std::string data_format = ctx.Attr<std::string>("data_format");
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        framework::StringToDataLayout(data_format), library_);
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                   framework::StringToDataLayout(data_format),
+                                   library_);
   }
 };
-
 class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
@@ -77,6 +89,9 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
         "Defaults to \"NHWC\". Specify the data format of the output data, "
         "the input will be transformed automatically. ")
         .SetDefault("AnyLayout");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
     AddComment(R"DOC(
 Softmax Operator.
 
@@ -119,19 +134,12 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
-    bool use_cudnn = ctx.Attr<bool>("use_cudnn");
-    bool runtime_cudnn_support = false;
+    framework::LibraryType library_{framework::LibraryType::kPlain};
 #ifdef PADDLE_WITH_CUDA
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
-      runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
-    }
-#endif
-    framework::LibraryType library_ = framework::LibraryType::kPlain;
-    if (use_cudnn && runtime_cudnn_support) {
+    if (platform::CanCUDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kCUDNN;
     }
+#endif
     std::string data_format = ctx.Attr<std::string>("data_format");
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
diff --git a/paddle/fluid/operators/target_assign_op.cc b/paddle/fluid/operators/target_assign_op.cc
index a894b12fa35a121eff0b8f9d2d0eecc5ae5185f3..33ff967e5e8f5afbaa62ba39ce596687ae0a71cd 100644
--- a/paddle/fluid/operators/target_assign_op.cc
+++ b/paddle/fluid/operators/target_assign_op.cc
@@ -153,8 +153,8 @@ template struct NegTargetAssignFunctor<platform::CPUDeviceContext, float,
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(target_assign, ops::TargetAssignOp,
-                             ops::TargetAssignOpMaker);
+REGISTER_OPERATOR(target_assign, ops::TargetAssignOp, ops::TargetAssignOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(
     target_assign,
     ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, int, float>,
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 7eec6ab657723c6390dfa14a78d6c49a76f2a279..686c0889140f0050b37192542ca98e2f3e5f23df 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -49,7 +49,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
 
-cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS})
+cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS})
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
 
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 7e001ecc56173db76e8c576e7efd66f41192f292..7c604e14eb245232ed92f53a00b9bde45c2fbaec 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -86,7 +86,8 @@ class CudnnDataType<float16> {
  public:
   static const cudnnDataType_t type = CUDNN_DATA_HALF;
   // The scaling param type is float for HALF and FLOAT tensors
-  typedef const float ScalingParamType;
+  using ScalingParamType = const float;
+  using BatchNormParamType = float;
   static ScalingParamType* kOne() {
     static ScalingParamType v = 1.0;
     return &v;
@@ -101,7 +102,8 @@ template <>
 class CudnnDataType<float> {
  public:
   static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
-  typedef const float ScalingParamType;
+  using ScalingParamType = const float;
+  using BatchNormParamType = float;
   static ScalingParamType* kOne() {
     static ScalingParamType v = 1.0;
     return &v;
@@ -116,7 +118,8 @@ template <>
 class CudnnDataType<double> {
  public:
   static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
-  typedef const double ScalingParamType;
+  using ScalingParamType = const double;
+  using BatchNormParamType = double;
   static ScalingParamType* kOne() {
     static ScalingParamType v = 1.0;
     return &v;
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index 52fb8c2531357ad7a2b2f8613e5c7fbcef52c6bb..2cf311c7e56a9bbb0bdb0078d5cfefb4bb50018b 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -483,9 +483,124 @@ DEVICE inline bool operator>=(const half& a, const half& b) {
 
 #endif  // PADDLE_CUDA_FP16
 
-// Arithmetic operators on ARMv8.2-A CPU
-#if defined(PADDLE_WITH_NATIVE_FP16)
-HOST inline float16 operator+(const float16& a, const float16& b) {
+// Arithmetic operators for float16 on GPU
+#if defined(PADDLE_CUDA_FP16)
+HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hadd(half(a), half(b)));
+#else
+  return float16(float(a) + float(b));
+#endif
+}
+
+HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hsub(half(a), half(b)));
+#else
+  return float16(float(a) - float(b));
+#endif
+}
+
+HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hmul(half(a), half(b)));
+#else
+  return float16(float(a) * float(b));
+#endif
+}
+
+HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  // TODO(kexinzhao): check which cuda version starts to support __hdiv
+  float num = __half2float(half(a));
+  float denom = __half2float(half(b));
+  return float16(num / denom);
+#else
+  return float16(float(a) / float(b));
+#endif
+}
+
+HOSTDEVICE inline float16 operator-(const float16& a) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hneg(half(a)));
+#else
+  float16 res;
+  res.x = a.x ^ 0x8000;
+  return res;
+#endif
+}
+
+HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
+  a = a + b;
+  return a;
+}
+
+HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
+  a = a - b;
+  return a;
+}
+
+HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
+  a = a * b;
+  return a;
+}
+
+HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
+  a = a / b;
+  return a;
+}
+
+HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __heq(half(a), half(b));
+#else
+  return float(a) == float(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hne(half(a), half(b));
+#else
+  return float(a) != float(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hlt(half(a), half(b));
+#else
+  return float(a) < float(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hle(half(a), half(b));
+#else
+  return float(a) <= float(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hgt(half(a), half(b));
+#else
+  return float(a) > float(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hge(half(a), half(b));
+#else
+  return float(a) >= float(b);
+#endif
+}
+
+// Arithmetic operators for float16 on ARMv8.2-A CPU
+#elif defined(PADDLE_WITH_NATIVE_FP16)
+inline float16 operator+(const float16& a, const float16& b) {
   float16 res;
   asm volatile(
       "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -501,7 +616,7 @@ HOST inline float16 operator+(const float16& a, const float16& b) {
   return res;
 }
 
-HOST inline float16 operator-(const float16& a, const float16& b) {
+inline float16 operator-(const float16& a, const float16& b) {
   float16 res;
   asm volatile(
       "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -517,7 +632,7 @@ HOST inline float16 operator-(const float16& a, const float16& b) {
   return res;
 }
 
-HOST inline float16 operator*(const float16& a, const float16& b) {
+inline float16 operator*(const float16& a, const float16& b) {
   float16 res;
   asm volatile(
       "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -533,7 +648,7 @@ HOST inline float16 operator*(const float16& a, const float16& b) {
   return res;
 }
 
-HOST inline float16 operator/(const float16& a, const float16& b) {
+inline float16 operator/(const float16& a, const float16& b) {
   float16 res;
   asm volatile(
       "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -549,7 +664,7 @@ HOST inline float16 operator/(const float16& a, const float16& b) {
   return res;
 }
 
-HOST inline float16 operator-(const float16& a) {
+inline float16 operator-(const float16& a) {
   float16 res;
   asm volatile(
       "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -564,27 +679,27 @@ HOST inline float16 operator-(const float16& a) {
   return res;
 }
 
-HOST inline float16& operator+=(float16& a, const float16& b) {
+inline float16& operator+=(float16& a, const float16& b) {
   a = a + b;
   return a;
 }
 
-HOST inline float16& operator-=(float16& a, const float16& b) {
+inline float16& operator-=(float16& a, const float16& b) {
   a = a - b;
   return a;
 }
 
-HOST inline float16& operator*=(float16& a, const float16& b) {
+inline float16& operator*=(float16& a, const float16& b) {
   a = a * b;
   return a;
 }
 
-HOST inline float16& operator/=(float16& a, const float16& b) {
+inline float16& operator/=(float16& a, const float16& b) {
   a = a / b;
   return a;
 }
 
-HOST inline bool operator==(const float16& a, const float16& b) {
+inline bool operator==(const float16& a, const float16& b) {
   uint16_t res;
   asm volatile(
       "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -600,11 +715,9 @@ HOST inline bool operator==(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
 
-HOST inline bool operator!=(const float16& a, const float16& b) {
-  return !(a == b);
-}
+inline bool operator!=(const float16& a, const float16& b) { return !(a == b); }
 
-HOST inline bool operator<(const float16& a, const float16& b) {
+inline bool operator<(const float16& a, const float16& b) {
   uint16_t res;
   asm volatile(
       "ld1 {v1.h}[0], [%[a_ptr]]\n"
@@ -620,7 +733,7 @@ HOST inline bool operator<(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
 
-HOST inline bool operator<=(const float16& a, const float16& b) {
+inline bool operator<=(const float16& a, const float16& b) {
   uint16_t res;
   asm volatile(
       "ld1 {v1.h}[0], [%[a_ptr]]\n"
@@ -636,7 +749,7 @@ HOST inline bool operator<=(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
 
-HOST inline bool operator>(const float16& a, const float16& b) {
+inline bool operator>(const float16& a, const float16& b) {
   uint16_t res;
   asm volatile(
       "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -652,7 +765,7 @@ HOST inline bool operator>(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
 
-HOST inline bool operator>=(const float16& a, const float16& b) {
+inline bool operator>=(const float16& a, const float16& b) {
   uint16_t res;
   asm volatile(
       "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -668,71 +781,71 @@ HOST inline bool operator>=(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
 
-// Arithmetic operators, software emulated on other CPU
+// Arithmetic operators for float16, software emulated on other CPU
 #else
-HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
+inline float16 operator+(const float16& a, const float16& b) {
   return float16(float(a) + float(b));
 }
 
-HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
+inline float16 operator-(const float16& a, const float16& b) {
   return float16(float(a) - float(b));
 }
 
-HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
+inline float16 operator*(const float16& a, const float16& b) {
   return float16(float(a) * float(b));
 }
 
-HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
+inline float16 operator/(const float16& a, const float16& b) {
   return float16(float(a) / float(b));
 }
 
-HOSTDEVICE inline float16 operator-(const float16& a) {
+inline float16 operator-(const float16& a) {
   float16 res;
   res.x = a.x ^ 0x8000;
   return res;
 }
 
-HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
+inline float16& operator+=(float16& a, const float16& b) {
   a = float16(float(a) + float(b));
   return a;
 }
 
-HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
+inline float16& operator-=(float16& a, const float16& b) {
   a = float16(float(a) - float(b));
   return a;
 }
 
-HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
+inline float16& operator*=(float16& a, const float16& b) {
   a = float16(float(a) * float(b));
   return a;
 }
 
-HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
+inline float16& operator/=(float16& a, const float16& b) {
   a = float16(float(a) / float(b));
   return a;
 }
 
-HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
+inline bool operator==(const float16& a, const float16& b) {
   return float(a) == float(b);
 }
 
-HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
+inline bool operator!=(const float16& a, const float16& b) {
   return float(a) != float(b);
 }
 
-HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
+inline bool operator<(const float16& a, const float16& b) {
   return float(a) < float(b);
 }
 
-HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
+inline bool operator<=(const float16& a, const float16& b) {
   return float(a) <= float(b);
 }
 
-HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
+inline bool operator>(const float16& a, const float16& b) {
   return float(a) > float(b);
 }
 
-HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
+inline bool operator>=(const float16& a, const float16& b) {
   return float(a) >= float(b);
 }
 #endif
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 030458f70a6630de56ef896a5730112a018ac0c3..de9a5cc20d76bf84778e0933831f218abb66c465 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -125,15 +125,11 @@ struct RecordBlock {
  private:
   std::string name_;
   uint64_t start_ns_;
-  int block_id_;
 };
 
 struct RecordThread {
   explicit RecordThread(int thread_id);
   ~RecordThread();
-
- private:
-  uint64_t start_ns_;
 };
 
 // Return the event list of all threads. Assumed the returned value calls
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index ecf9e47884990e1e8a07fe7ab0e884f0c94438c9..ada69ea4a425f70dc085ad9046bb6b930136803d 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,10 +1,18 @@
 if(WITH_PYTHON)
-  cc_library(paddle_pybind SHARED
-    SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
-    DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
-         parallel_executor
-    ${GLOB_OP_LIB})
-  if(NOT APPLE AND NOT ANDROID)
-    target_link_libraries(paddle_pybind rt)
-  endif(NOT APPLE AND NOT ANDROID)
+  if(WITH_AMD_GPU)
+    hip_library(paddle_pybind SHARED
+      SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
+      DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
+           parallel_executor
+      ${GLOB_OP_LIB})
+  else()
+    cc_library(paddle_pybind SHARED
+      SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
+      DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
+           parallel_executor
+      ${GLOB_OP_LIB})
+    if(NOT APPLE AND NOT ANDROID)
+      target_link_libraries(paddle_pybind rt)
+    endif(NOT APPLE AND NOT ANDROID)
+  endif(WITH_AMD_GPU)
 endif(WITH_PYTHON)
diff --git a/paddle/fluid/recordio/header.cc b/paddle/fluid/recordio/header.cc
index e50de15b7c2b480357f5f6c7daa2b4a676749679..ed09d58f6a3e2dba50bf4407c0463480575b248e 100644
--- a/paddle/fluid/recordio/header.cc
+++ b/paddle/fluid/recordio/header.cc
@@ -29,8 +29,8 @@ Header::Header(uint32_t num, uint32_t sum, Compressor c, uint32_t cs)
 
 bool Header::Parse(std::istream& is) {
   uint32_t magic;
-  size_t read_size =
-      is.readsome(reinterpret_cast<char*>(&magic), sizeof(uint32_t));
+  is.read(reinterpret_cast<char*>(&magic), sizeof(uint32_t));
+  size_t read_size = is.gcount();
   if (read_size < sizeof(uint32_t)) {
     return false;
   }
diff --git a/paddle/fluid/recordio/scanner.cc b/paddle/fluid/recordio/scanner.cc
index d842f8fe5a4c9d1a2b564c738d97fffb02f3ccb5..c22281dc97e05173ad76ce76959833b92f11c4ee 100644
--- a/paddle/fluid/recordio/scanner.cc
+++ b/paddle/fluid/recordio/scanner.cc
@@ -28,6 +28,7 @@ Scanner::Scanner(const std::string &filename) {
 }
 
 void Scanner::Reset() {
+  stream_->clear();
   stream_->seekg(0, std::ios::beg);
   ParseNextChunk();
 }
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
old mode 100644
new mode 100755
index 6be2bd8fad9e33cf4e1dcafdd6b8f39111bdbe88..322f72e4a58c7e8f2c26d994477cbb55551c595a
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -35,8 +35,9 @@ function cmake_gen() {
         -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
         ${PYTHON_FLAGS}
         -DWITH_DSO=ON
-        -DWITH_DOC=OFF
+        -DWITH_DOC=${WITH_DOC:-OFF}
         -DWITH_GPU=${WITH_GPU:-OFF}
+        -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
         -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
         -DWITH_MKL=${WITH_MKL:-ON}
         -DWITH_AVX=${WITH_AVX:-OFF}
@@ -50,6 +51,7 @@ function cmake_gen() {
         -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
         -DWITH_TESTING=${WITH_TESTING:-ON}
         -DWITH_FAST_BUNDLE_TEST=ON
+        -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
     ========================================
 EOF
@@ -60,8 +62,9 @@ EOF
         -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
         ${PYTHON_FLAGS} \
         -DWITH_DSO=ON \
-        -DWITH_DOC=OFF \
+        -DWITH_DOC=${WITH_DOC:-OFF} \
         -DWITH_GPU=${WITH_GPU:-OFF} \
+        -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
         -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
         -DWITH_MKL=${WITH_MKL:-ON} \
         -DWITH_AVX=${WITH_AVX:-OFF} \
@@ -74,6 +77,7 @@ EOF
         -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
         -DWITH_TESTING=${WITH_TESTING:-ON} \
         -DWITH_FAST_BUNDLE_TEST=ON \
+        -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 }
 
@@ -231,7 +235,7 @@ gen_capi_package
 gen_fluid_inference_lib
 
 if [[ ${WITH_C_API:-OFF} == "ON" ]]; then
-  printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n" 
+  printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n"
 else
   printf "If you need to install PaddlePaddle in develop docker image,"
   printf "please make install or pip install build/python/dist/*.whl.\n"
diff --git a/paddle/scripts/tools/build_docs/.gitignore b/paddle/scripts/tools/build_docs/.gitignore
deleted file mode 100644
index 6ec14c8f5bc3774a81dbe87c44f458594b38f12c..0000000000000000000000000000000000000000
--- a/paddle/scripts/tools/build_docs/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-doc
-doc_cn
diff --git a/paddle/scripts/tools/build_docs/build_docs.sh b/paddle/scripts/tools/build_docs/build_docs.sh
deleted file mode 100755
index f9bc8bf63ae9afdfca1ff660bc83e62e71f03005..0000000000000000000000000000000000000000
--- a/paddle/scripts/tools/build_docs/build_docs.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-docker run --rm \
-       -v $(git rev-parse --show-toplevel):/paddle \
-       -e "WITH_GPU=OFF" \
-       -e "WITH_AVX=ON" \
-       -e "WITH_DOC=ON" \
-       -e "WOBOQ=ON" \
-       ${1:-"paddlepaddle/paddle:latest-dev"}
diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py
index 535e881c42f675198a2679cb7974af64b65cc194..d65e1a6858373d8e172cb8112a10a77d2e9bd5bc 100644
--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from layers.control_flow import BlockGuard, Select
+from layers.control_flow import BlockGuard, equal
+from .framework import Operator
 from layer_helper import LayerHelper, unique_name
 from layers import fill_constant
 import core
@@ -75,6 +76,185 @@ class Go(BlockGuard):
             attrs={'sub_block': go_block})
 
 
+class SelectCase(object):
+    DEFAULT = 0
+    SEND = 1
+    RECEIVE = 2
+
+    def __init__(self,
+                 case_idx,
+                 case_to_execute,
+                 channel_action_fn=None,
+                 channel=None,
+                 value=None):
+        self.helper = LayerHelper('conditional_block')
+        self.main_program = self.helper.main_program
+        self.is_scalar_condition = True
+
+        self.case_to_execute = case_to_execute
+        self.idx = case_idx
+
+        # Since we aren't going to use the `channel_send` or `channel_recv`
+        # functions directly, we just need to capture the name.
+        self.action = (self.SEND
+                       if channel_action_fn.__name__ == ('channel_send') else
+                       self.RECEIVE) if channel_action_fn else self.DEFAULT
+        self.value = value
+        self.channel = channel
+
+    def __enter__(self):
+        self.block = self.main_program.create_block()
+
+    def construct_op(self):
+        main_program = self.helper.main_program
+        cases_block = main_program.current_block()
+
+        inner_outputs = set()
+        input_set = set()
+        params = set()
+
+        for op in self.block.ops:
+            # Iterate over all operators, get all the inputs
+            # and add as input to the SelectCase operator.
+            for iname in op.input_names:
+                for in_var_name in op.input(iname):
+                    if in_var_name not in inner_outputs:
+                        input_set.add(in_var_name)
+
+            for oname in op.output_names:
+                for out_var_name in op.output(oname):
+                    inner_outputs.add(out_var_name)
+
+        param_list = [
+            cases_block.var(each_name) for each_name in params
+            if each_name not in input_set
+        ]
+
+        # Iterate over all operators, get all the outputs
+        # add to the output list of SelectCase operator only if
+        # they exist in the parent block.
+        out_vars = []
+        for inner_out_name in inner_outputs:
+            if inner_out_name in cases_block.vars:
+                out_vars.append(cases_block.var(inner_out_name))
+
+        # First, create an op that will determine whether or not this is the
+        # conditional variable to execute.
+        should_execute_block = equal(
+            fill_constant(
+                shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx),
+            self.case_to_execute)
+
+        step_scope = cases_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+
+        cases_block.append_op(
+            type='conditional_block',
+            inputs={'X': [should_execute_block],
+                    'Params': param_list},
+            outputs={'Out': out_vars,
+                     'Scope': [step_scope]},
+            attrs={
+                'sub_block': self.block,
+                'is_scalar_condition': self.is_scalar_condition
+            })
+
+        return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name
+                                if self.channel else '', self.value.name
+                                if self.value else '')
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.main_program.rollback()
+        if exc_type is not None:
+            return False  # re-raise exception
+        return True
+
+
+class Select(BlockGuard):
+    def __init__(self, name=None):
+        self.helper = LayerHelper('select', name=name)
+        self.cases = []
+
+        super(Select, self).__init__(self.helper.main_program)
+        self.case_to_execute = fill_constant(
+            shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
+
+    def __enter__(self):
+        super(Select, self).__enter__()
+        return self
+
+    def case(self, channel_action_fn, channel, value):
+        """Create a new block for this condition.
+        """
+        select_case = SelectCase(
+            len(self.cases), self.case_to_execute, channel_action_fn, channel,
+            value)
+
+        self.cases.append(select_case)
+
+        return select_case
+
+    def default(self):
+        """Create a default case block for this condition.
+        """
+        default_case = SelectCase(len(self.cases), self.case_to_execute)
+
+        self.cases.append(default_case)
+
+        return default_case
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            return False
+
+        # Create a select op and another block to wrap its
+        # case blocks.
+        select_block = self.helper.main_program.current_block()
+        parent_block = self.helper.main_program.block(select_block.parent_idx)
+
+        # Construct each case op, inside the newly created select block.
+        serialized_cases = []
+        for case in self.cases:
+            serialized_cases.append(case.construct_op())
+
+        intermediate = set()
+        params = set()
+
+        for case_block in select_block.ops:
+            if case_block.attrs and 'sub_block' in case_block.attrs:
+                for each_op in case_block.attrs['sub_block'].ops:
+                    assert isinstance(each_op, Operator)
+                    for iname in each_op.input_names:
+                        for in_var_name in each_op.input(iname):
+                            if in_var_name not in intermediate:
+                                params.add(in_var_name)
+
+                    for oname in each_op.output_names:
+                        for out_var_name in each_op.output(oname):
+                            intermediate.add(out_var_name)
+
+        out_list = [
+            parent_block.var(var_name) for var_name in parent_block.vars
+            if var_name in intermediate
+        ]
+
+        X = [select_block.var_recursive(x_name) for x_name in params]
+
+        # Needs to be used by `equal` inside the cases block.
+        X.append(self.case_to_execute)
+
+        # Construct the select op.
+        parent_block.append_op(
+            type='select',
+            inputs={'X': X,
+                    'case_to_execute': self.case_to_execute},
+            attrs={'sub_block': select_block,
+                   'cases': serialized_cases},
+            outputs={'Out': out_list})
+
+        return super(Select, self).__exit__(exc_type, exc_val, exc_tb)
+
+
 def make_channel(dtype, capacity=0):
     """
     Helps implementation of a concurrent program by creating a "channel" of
@@ -131,7 +311,7 @@ def make_channel(dtype, capacity=0):
     return channel
 
 
-def channel_send(channel, value):
+def channel_send(channel, value, is_copy=False):
     """
     Sends a value through a channel variable. Used by an unbuffered or buffered
     channel to pass data from within or to a concurrent Go block, where
@@ -141,6 +321,8 @@ def channel_send(channel, value):
         channel (Variable|Channel): Channel variable created using
         `make_channel`.
         value (Variable): Value to send to channel
+        is_copy (bool): Copy data while channel send. If False, then data
+        is moved. The input cannot be used after move. (default False)
     Returns:
         Variable: The boolean status on whether or not the channel
                   successfully sent the passed value.
@@ -162,11 +344,26 @@ def channel_send(channel, value):
         type=core.VarDesc.VarType.LOD_TENSOR,
         dtype=core.VarDesc.VarType.BOOL)
 
+    X = value
+
+    if is_copy is True:
+        copied_X = helper.create_variable(
+            name=unique_name.generate(value.name + '_copy'),
+            type=value.type,
+            dtype=value.dtype,
+            shape=value.shape,
+            lod_level=value.lod_level,
+            capacity=value.capacity)
+
+        assign_op = channel_send_block.append_op(
+            type="assign_op", inputs={"X": value}, outputs={"Out": copied_X})
+        X = copied_X
+
     channel_send_op = channel_send_block.append_op(
         type="channel_send",
         inputs={
             "Channel": channel,
-            "X": value,
+            "X": X,
         },
         outputs={"Status": status})
 
diff --git a/python/paddle/fluid/debuger.py b/python/paddle/fluid/debuger.py
index 97fa182c4007cc730c06e9f95259a2509e01ecdf..7b4afa9bf65e1369329cd4648c1f5c4bd8fa8357 100644
--- a/python/paddle/fluid/debuger.py
+++ b/python/paddle/fluid/debuger.py
@@ -16,7 +16,6 @@ import sys
 import re
 from graphviz import GraphPreviewGenerator
 import proto.framework_pb2 as framework_pb2
-import paddle.fluid.core as core
 
 _vartype2str_ = [
     "UNK",
@@ -126,7 +125,6 @@ def pprint_block_codes(block_desc, show_backward=False):
     def is_var_backward(var_desc):
         return "@GRAD" in var_desc.name
 
-    #print(type(block_desc))
     if type(block_desc) is not framework_pb2.BlockDesc:
         block_desc = framework_pb2.BlockDesc.FromString(
             block_desc.serialize_to_string())
diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py
index 3d3a6c116eeb39fb7236d0e9707415cdd6b828bd..62147d325b699a62bd39cfbaca44874b7fc19a0f 100644
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
@@ -20,6 +20,7 @@ from layer_helper import LayerHelper
 from distributed_spliter import *
 import math
 from . import core
+import debuger
 
 
 class VarBlock:
@@ -289,6 +290,7 @@ class DistributeTranspiler:
                     dtype=v.dtype,
                     shape=v.shape)
                 recv_inputs.append(var)
+
         # step3
         optimize_block = pserver_program.create_block(0)
         # step 4
@@ -307,15 +309,57 @@ class DistributeTranspiler:
         # Iterate through the ops, and if an op and the optimize ops
         # which located on current pserver are in one set, then 
         # append it into the sub program.
-        for _, op in enumerate(self.optimize_ops):
-            for _, opt_op in enumerate(opt_op_on_pserver):
-                if ufind.is_connected(op, opt_op):
-                    if self._is_opt_op(op):
-                        self._append_pserver_ops(optimize_block, op, endpoint,
-                                                 default_main_program())
-                    else:
-                        self._append_pserver_non_opt_ops(optimize_block, op)
-                    break
+
+        # We try to put optimization program run parallelly, assume
+        # optimization program always looks like:
+        #
+        # prevop -> prevop -> opt op -> following op -> following op; ->
+        # prevop -> prevop -> opt op -> following op -> following op; ->
+        # global op -> global op
+        #
+        # we put operators that can run parallelly to many program blocks.
+        # in above example, we seperate ops by the ";". Global ops must run
+        # after all the optimize ops finished.
+
+        global_ops = []
+        # HACK: optimization global ops only used to scale beta1 and beta2
+        # replace it with dependency engine.
+        for op in self.optimize_ops:
+            if op.type == "scale":
+                for in_name in op.input_arg_names:
+                    if in_name.startswith("beta1_pow_acc") or\
+                        in_name.startswith("beta2_pow_acc"):
+                        global_ops.append(op)
+
+        def __append_optimize_op__(op, block):
+            if self._is_opt_op(op):
+                self._append_pserver_ops(block, op, endpoint,
+                                         default_main_program())
+            else:
+                self._append_pserver_non_opt_ops(block, op)
+
+        # append op to the current block
+        per_opt_block = optimize_block
+        for _, opt_op in enumerate(opt_op_on_pserver):
+            for _, op in enumerate(self.optimize_ops):
+                # optimizer is connected to itself
+                if ufind.is_connected(op, opt_op) and \
+                    op not in global_ops:
+                    __append_optimize_op__(op, per_opt_block)
+            per_opt_block = pserver_program.create_block(0)
+
+        # append global ops
+        for glb_op in global_ops:
+            __append_optimize_op__(glb_op, per_opt_block)
+
+        # NOT USED: single block version:
+        #
+        # for _, op in enumerate(self.optimize_ops):
+        #     for _, opt_op in enumerate(opt_op_on_pserver):
+        #         if ufind.is_connected(op, opt_op):
+        #             __append_optimize_op__(glb_op, optimize_block)
+        #             break
+
         # step5 append the listen_and_serv op
         pserver_program.global_block().append_op(
             type="listen_and_serv",
@@ -521,6 +565,8 @@ class DistributeTranspiler:
         orig_var_name = ""
         if suff_idx >= 0:
             orig_var_name = varname[:suff_idx]
+        else:
+            orig_var_name = varname
         return orig_var_name
 
     def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
@@ -535,7 +581,8 @@ class DistributeTranspiler:
                 grad_block = None
                 for g in self.param_grad_ep_mapping[endpoint]["grads"]:
                     if same_or_split_var(
-                            self._orig_varname(g.name), opt_op.input(key)[0]):
+                            self._orig_varname(g.name),
+                            self._orig_varname(opt_op.input(key)[0])):
                         grad_block = g
                         break
                 if not grad_block:
@@ -660,10 +707,22 @@ class DistributeTranspiler:
         # If one op's input is another op's output or
         # one op's output is another op's input, we say
         # the two operator is connected.
-        op1_input_names = op1.desc.input_arg_names()
+        def _append_inname_remove_beta(varname_list):
+            op_input_names = []
+            for in_name in varname_list:
+                # HACK: remove beta1 and beta2 to avoid let all
+                # ops connected.
+                if in_name.startswith("beta2_pow_acc") or \
+                    in_name.startswith("beta1_pow_acc"):
+                    continue
+                else:
+                    op_input_names.append(in_name)
+            return op_input_names
+
+        op1_input_names = _append_inname_remove_beta(op1.desc.input_arg_names())
         op1_output_names = op1.desc.output_arg_names()
 
-        op2_input_names = op2.desc.input_arg_names()
+        op2_input_names = _append_inname_remove_beta(op2.desc.input_arg_names())
         op2_output_names = op2.desc.output_arg_names()
 
         if set(op1_output_names) & set(op2_input_names) or \
@@ -694,7 +753,7 @@ class DistributeTranspiler:
         param_names = [
             p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
         ]
-        if op.input("Param") in param_names:
+        if op.input("Param")[0] in param_names:
             return True
         else:
             for n in param_names:
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 4490f2bf153f672464ec8bca2a44109c9fe0dd04..2612fb1ae41986ae0d5c6e942cc3accebcb00e19 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -235,6 +235,77 @@ class Executor(object):
             tensor.set_lod(lod)
             return tensor
 
+    def _get_program_cache(self, program_cache_key):
+        return self.program_caches.get(program_cache_key, None)
+
+    def _add_program_cache(self, program_cache_key, program):
+        self.program_caches[program_cache_key] = program
+
+    def _add_feed_fetch_ops(self, program, feed, fetch_list, feed_var_name,
+                            fetch_var_name):
+        tmp_program = program.clone()
+
+        global_block = tmp_program.global_block()
+
+        if feed_var_name in global_block.vars:
+            feed_var = global_block.var(feed_var_name)
+        else:
+            feed_var = global_block.create_var(
+                name=feed_var_name,
+                type=core.VarDesc.VarType.FEED_MINIBATCH,
+                persistable=True)
+
+        if fetch_var_name in global_block.vars:
+            fetch_var = global_block.var(fetch_var_name)
+        else:
+            fetch_var = global_block.create_var(
+                name=fetch_var_name,
+                type=core.VarDesc.VarType.FETCH_LIST,
+                persistable=True)
+
+        # prepend feed operators
+        if not has_feed_operators(global_block, feed, feed_var_name):
+            for i, name in enumerate(feed):
+                out = global_block.var(name)
+                global_block.prepend_op(
+                    type='feed',
+                    inputs={'X': [feed_var]},
+                    outputs={'Out': [out]},
+                    attrs={'col': i})
+
+        # append fetch_operators
+        if not has_fetch_operators(global_block, fetch_list, fetch_var_name):
+            for i, var in enumerate(fetch_list):
+                assert isinstance(var, Variable) or isinstance(var, str), (
+                    "Wrong type for fetch_list[%s]: %s" % (i, type(var)))
+                global_block.append_op(
+                    type='fetch',
+                    inputs={'X': [var]},
+                    outputs={'Out': [fetch_var]},
+                    attrs={'col': i})
+
+        return tmp_program
+
+    def _feed_data(self, program, feed, feed_var_name, scope):
+        # feed var to framework
+        for op in program.global_block().ops:
+            if op.desc.type() == 'feed':
+                feed_target_name = op.desc.output('Out')[0]
+                cur_feed = feed[feed_target_name]
+                if not isinstance(cur_feed, core.LoDTensor):
+                    cur_feed = self.aslodtensor(cur_feed)
+                idx = op.desc.attr('col')
+                core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
+            else:
+                break
+
+    def _fetch_data(self, fetch_list, fetch_var_name, scope):
+        outs = [
+            core.get_fetch_variable(scope, fetch_var_name, i)
+            for i in xrange(len(fetch_list))
+        ]
+        return outs
+
     def run(self,
             program=None,
             feed=None,
@@ -268,7 +339,6 @@ class Executor(object):
             raise TypeError("feed should be a map")
         if fetch_list is None:
             fetch_list = []
-
         if program is None:
             program = default_main_program()
 
@@ -278,79 +348,30 @@ class Executor(object):
         if scope is None:
             scope = global_scope()
 
-        program_cache = None
-        program_cache_key = get_program_cache_key(feed, fetch_list)
-
+        cache_key = get_program_cache_key(feed, fetch_list)
         if use_program_cache:
-            # find program cache by cache_key
-            program_cache = self.program_caches.get(program_cache_key, None)
-            # TODO(qiao): Should check program_cache and program are exactly the same.
+            cached_program = self._get_program_cache(cache_key)
+            if cached_program is None:
+                cached_program = self._add_feed_fetch_ops(
+                    program=program,
+                    feed=feed,
+                    fetch_list=fetch_list,
+                    feed_var_name=feed_var_name,
+                    fetch_var_name=fetch_var_name)
+                self._add_program_cache(cache_key, cached_program)
+            program = cached_program
         else:
-            self.program_caches.pop(program_cache_key, None)
-
-        if program_cache is None:
-            program_cache = program.clone()
-
-            if use_program_cache:
-                self.program_caches[program_cache_key] = program_cache
-
-            global_block = program_cache.global_block()
-
-            if feed_var_name in global_block.vars:
-                feed_var = global_block.var(feed_var_name)
-            else:
-                feed_var = global_block.create_var(
-                    name=feed_var_name,
-                    type=core.VarDesc.VarType.FEED_MINIBATCH,
-                    persistable=True)
-
-            if fetch_var_name in global_block.vars:
-                fetch_var = global_block.var(fetch_var_name)
-            else:
-                fetch_var = global_block.create_var(
-                    name=fetch_var_name,
-                    type=core.VarDesc.VarType.FETCH_LIST,
-                    persistable=True)
-
-            # prepend feed operators
-            if not has_feed_operators(global_block, feed, feed_var_name):
-                for i, name in enumerate(feed):
-                    out = global_block.var(name)
-                    global_block.prepend_op(
-                        type='feed',
-                        inputs={'X': [feed_var]},
-                        outputs={'Out': [out]},
-                        attrs={'col': i})
-
-            # append fetch_operators
-            if not has_fetch_operators(global_block, fetch_list,
-                                       fetch_var_name):
-                for i, var in enumerate(fetch_list):
-                    assert isinstance(var, Variable) or isinstance(var, str), (
-                        "Wrong type for fetch_list[%s]: %s" % (i, type(var)))
-                    global_block.append_op(
-                        type='fetch',
-                        inputs={'X': [var]},
-                        outputs={'Out': [fetch_var]},
-                        attrs={'col': i})
-
-        # feed var to framework
-        for op in program_cache.global_block().ops:
-            if op.desc.type() == 'feed':
-                feed_target_name = op.desc.output('Out')[0]
-                cur_feed = feed[feed_target_name]
-                if not isinstance(cur_feed, core.LoDTensor):
-                    cur_feed = self.aslodtensor(cur_feed)
-                idx = op.desc.attr('col')
-                core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
-            else:
-                break
-
-        self.executor.run(program_cache.desc, scope, 0, True, True)
-        outs = [
-            core.get_fetch_variable(scope, fetch_var_name, i)
-            for i in xrange(len(fetch_list))
-        ]
+            self.program_caches.pop(cache_key, None)
+            program = self._add_feed_fetch_ops(
+                program=program,
+                feed=feed,
+                fetch_list=fetch_list,
+                feed_var_name=feed_var_name,
+                fetch_var_name=fetch_var_name)
+
+        self._feed_data(program, feed, feed_var_name, scope)
+        self.executor.run(program.desc, scope, 0, True, True)
+        outs = self._fetch_data(fetch_list, fetch_var_name, scope)
         if return_numpy:
             outs = as_numpy(outs)
         return outs
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 70ecffd910a46570b5a8e576d88039fa5e22e726..3e78788f470556d2196b5104f69a0a3285543ec4 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -918,6 +918,24 @@ class Block(object):
                 name=v.name)
             self.vars[new_p.name] = new_p
 
+    def clone_variable(self, var):
+        """
+        Clone a variable into current block.
+        Args:
+            var: the variable to be cloned.
+
+        Returns:
+            The new  variable cloned from 'var' in current block.
+        """
+        assert isinstance(var, Variable)
+        return self.create_var(
+            name=var.name,
+            shape=var.shape,
+            dtype=var.dtype,
+            type=var.type,
+            lod_level=var.lod_level,
+            persistable=True)
+
 
 class Program(object):
     def __init__(self):
@@ -960,14 +978,14 @@ class Program(object):
         """Clone the Program object
 
         Set for_test to False when we want to clone the program for training.
-        Set for_test to True when we want to clone the program for testing.         
+        Set for_test to True when we want to clone the program for testing.
 
         Args:
             for_test(bool): Some operators, such as batch_norm and drop_out ops,
                 behave differently in training and testing. If for_test is True,
                 the is_test attributes in these operators will be set to True for
-                testing purposes, otherwise, they remain unchanged.  
-                
+                testing purposes, otherwise, they remain unchanged.
+
         Returns(Program):
             The cloned Program object.
         """
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index da7e74c901e1f5be709c5f9d73f048bfda0c5549..d771837fc545167f7c32fcf914dd1c3c3ae64fb3 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -399,7 +399,12 @@ class LayerHelper(object):
         if isinstance(act, basestring):
             act = {'type': act}
         tmp = self.create_tmp_variable(dtype=input_var.dtype)
+
+        if 'use_mkldnn' in self.kwargs:
+            act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
         act_type = act.pop('type')
+        if 'use_mkldnn' in self.kwargs:
+            act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
         self.append_op(
             type=act_type,
             inputs={"X": [input_var]},
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 02cd0a05a11d8d1d52d42c2b62799f1093d0abc2..1bb1aa30ee1019c6f80eb64b6dc20459e7a3073b 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -16,7 +16,7 @@ import contextlib
 from layer_function_generator import autodoc
 from tensor import assign, fill_constant
 from .. import core
-from ..framework import Program, Variable, Operator, Block
+from ..framework import Program, Variable, Operator
 from ..layer_helper import LayerHelper, unique_name
 from ops import logical_and, logical_not, logical_or
 
@@ -29,7 +29,6 @@ __all__ = [
     'WhileGuard',
     'While',
     'Switch',
-    'Select',
     'lod_rank_table',
     'max_sequence_len',
     'topk',
@@ -1212,186 +1211,6 @@ class Switch(object):
         return True
 
 
-class SelectCase(object):
-    DEFAULT = 0
-    SEND = 1
-    RECEIVE = 2
-
-    def __init__(self,
-                 case_idx,
-                 case_to_execute,
-                 channel_action_fn=None,
-                 channel=None,
-                 value=None):
-        self.helper = LayerHelper('conditional_block')
-        self.main_program = self.helper.main_program
-        self.is_scalar_condition = True
-
-        self.case_to_execute = case_to_execute
-        self.idx = case_idx
-
-        # Since we aren't going to use the `channel_send` or `channel_recv`
-        # functions directly, we just need to capture the name.
-        self.action = (self.SEND
-                       if channel_action_fn.__name__ == ('channel_send') else
-                       self.RECEIVE) if channel_action_fn else (self.DEFAULT)
-        self.value = value
-        self.channel = channel
-
-    def __enter__(self):
-        self.block = self.main_program.create_block()
-
-    def construct_op(self):
-        main_program = self.helper.main_program
-        cases_block = main_program.current_block()
-
-        inner_outputs = set()
-        input_set = set()
-        params = set()
-
-        for op in self.block.ops:
-            # Iterate over all operators, get all the inputs
-            # and add as input to the SelectCase operator.
-            for iname in op.input_names:
-                for in_var_name in op.input(iname):
-                    if in_var_name not in inner_outputs:
-                        input_set.add(in_var_name)
-
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    inner_outputs.add(out_var_name)
-
-        param_list = [
-            cases_block.var(each_name) for each_name in params
-            if each_name not in input_set
-        ]
-
-        # Iterate over all operators, get all the outputs
-        # add to the output list of SelectCase operator only if
-        # they exist in the parent block.
-        out_vars = []
-        for inner_out_name in inner_outputs:
-            if inner_out_name in cases_block.vars:
-                out_vars.append(cases_block.var(inner_out_name))
-
-        # First, create an op that will determine whether or not this is the
-        # conditional variable to execute.
-        should_execute_block = equal(
-            fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx),
-            self.case_to_execute)
-
-        step_scope = cases_block.create_var(
-            type=core.VarDesc.VarType.STEP_SCOPES)
-
-        cases_block.append_op(
-            type='conditional_block',
-            inputs={'X': [should_execute_block],
-                    'Params': param_list},
-            outputs={'Out': out_vars,
-                     'Scope': [step_scope]},
-            attrs={
-                'sub_block': self.block,
-                'is_scalar_condition': self.is_scalar_condition
-            })
-
-        return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name
-                                if self.channel else '', self.value.name
-                                if self.value else '')
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.main_program.rollback()
-        if exc_type is not None:
-            return False  # re-raise exception
-        return True
-
-
-class Select(BlockGuard):
-    def __init__(self, name=None):
-        self.helper = LayerHelper('select', name=name)
-        self.cases = []
-
-        super(Select, self).__init__(self.helper.main_program)
-        self.case_to_execute = fill_constant(
-            shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
-
-    def __enter__(self):
-        super(Select, self).__enter__()
-        return self
-
-    def case(self, channel_action_fn, channel, value):
-        """Create a new block for this condition.
-        """
-        select_case = SelectCase(
-            len(self.cases), self.case_to_execute, channel_action_fn, channel,
-            value)
-
-        self.cases.append(select_case)
-
-        return select_case
-
-    def default(self):
-        """Create a default case block for this condition.
-        """
-        default_case = SelectCase(len(self.cases), self.case_to_execute)
-
-        self.cases.append(default_case)
-
-        return default_case
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_type is not None:
-            return False
-
-        # Create a select op and another block to wrap its
-        # case blocks.
-        select_block = self.helper.main_program.current_block()
-        parent_block = self.helper.main_program.block(select_block.parent_idx)
-
-        # Construct each case op, inside the newly created select block.
-        serialized_cases = []
-        for case in self.cases:
-            serialized_cases.append(case.construct_op())
-
-        intermediate = set()
-        params = set()
-
-        for case_block in select_block.ops:
-            if case_block.attrs and 'sub_block' in case_block.attrs:
-                for each_op in case_block.attrs['sub_block'].ops:
-                    assert isinstance(each_op, Operator)
-                    for iname in each_op.input_names:
-                        for in_var_name in each_op.input(iname):
-                            if in_var_name not in intermediate:
-                                params.add(in_var_name)
-
-                    for oname in each_op.output_names:
-                        for out_var_name in each_op.output(oname):
-                            intermediate.add(out_var_name)
-
-        # TODO(varunarora): Figure out if defining output is needed.
-        out_list = [
-            parent_block.var(var_name) for var_name in parent_block.vars
-            if var_name in intermediate
-        ]
-
-        X = [select_block.var_recursive(x_name) for x_name in params]
-
-        # Needs to be used by `equal` inside the cases block.
-        X.append(self.case_to_execute)
-
-        # Construct the select op.
-        parent_block.append_op(
-            type='select',
-            inputs={'X': X,
-                    'case_to_execute': self.case_to_execute},
-            attrs={'sub_block': select_block,
-                   'cases': serialized_cases},
-            outputs={})
-
-        return super(Select, self).__exit__(exc_type, exc_val, exc_tb)
-
-
 class IfElseBlockGuard(object):
     def __init__(self, is_true, ifelse):
         if not isinstance(ifelse, IfElse):
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index a889ab6bdc6ac9494ef992a97292b7a2536c41c4..cd519e1ee082d27ccadc6247c149701fac31e812 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -129,13 +129,11 @@ def detection_output(loc,
         prior_box_var=prior_box_var,
         target_box=loc,
         code_type='decode_center_size')
-
     old_shape = scores.shape
     scores = ops.reshape(x=scores, shape=(-1, old_shape[-1]))
     scores = nn.softmax(input=scores)
     scores = ops.reshape(x=scores, shape=old_shape)
     scores = nn.transpose(scores, perm=[0, 2, 1])
-
     nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
     helper.append_op(
         type="multiclass_nms",
@@ -475,6 +473,7 @@ def ssd_loss(location,
     # 2. Compute confidence for mining hard examples
     # 2.1. Get the target label based on matched indices
     gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, ))
+    gt_label.stop_gradient = True
     target_label, _ = target_assign(
         gt_label, matched_indices, mismatch_value=background_label)
     # 2.2. Compute confidence loss.
@@ -482,10 +481,12 @@ def ssd_loss(location,
     confidence = __reshape_to_2d(confidence)
     target_label = tensor.cast(x=target_label, dtype='int64')
     target_label = __reshape_to_2d(target_label)
+    target_label.stop_gradient = True
     conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
 
     # 3. Mining hard examples
     conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior))
+    conf_loss.stop_gradient = True
     neg_indices = helper.create_tmp_variable(dtype='int32')
     dtype = matched_indices.dtype
     updated_matched_indices = helper.create_tmp_variable(dtype=dtype)
@@ -695,6 +696,8 @@ def multi_box_head(inputs,
             outputs={"Boxes": box,
                      "Variances": var},
             attrs=attrs, )
+        box.stop_gradient = True
+        var.stop_gradient = True
         return box, var
 
     def _reshape_with_axis_(input, axis=1):
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 9c91f395e7c9d7ca76c1a5cc310bc3bbc06daec9..bc5e291ad811315ddc9d101853d69c7f5ab5082d 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -21,7 +21,8 @@ from ..executor import global_scope
 
 __all__ = [
     'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
-    'read_file', 'create_shuffle_reader', 'create_double_buffer_reader'
+    'open_files', 'read_file', 'create_shuffle_reader',
+    'create_double_buffer_reader', 'create_multi_pass_reader'
 ]
 
 
@@ -287,6 +288,36 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes):
                              startup_var)
 
 
+def open_files(filenames, thread_num, shapes, lod_levels, dtypes):
+    dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
+    shape_concat = []
+    ranks = []
+
+    for shape in shapes:
+        shape_concat.extend(shape)
+        ranks.append(len(shape))
+
+    var_name = unique_name('multiple_reader')
+
+    startup_blk = default_startup_program().current_block()
+    startup_var = startup_blk.create_var(name=var_name)
+    startup_blk.append_op(
+        type='open_files',
+        outputs={'Out': [startup_var]},
+        attrs={
+            'shape_concat': shape_concat,
+            'lod_levels': lod_levels,
+            'ranks': ranks,
+            'file_names': filenames,
+            'thread_num': thread_num
+        })
+
+    startup_var.desc.set_dtypes(dtypes)
+    startup_var.persistable = True
+    return _copy_reader_var_(default_main_program().current_block(),
+                             startup_var)
+
+
 def __create_decorated_reader__(op_type, reader, attrs):
     var_name = unique_name(op_type)
     startup_blk = default_startup_program().current_block()
@@ -314,6 +345,11 @@ def create_double_buffer_reader(reader, place=None):
                                        attrs)
 
 
+def create_multi_pass_reader(reader, pass_num):
+    return __create_decorated_reader__('create_multi_pass_reader', reader,
+                                       {'pass_num': int(pass_num)})
+
+
 def read_file(file_obj):
     helper = LayerHelper('read_file')
     out = [
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index bf161d6618b10da66f25d3f11300a4a2b10b875a..679de6ce2aa67abe1322702fcb371eded0130698 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -73,6 +73,7 @@ __all__ = [
     'smooth_l1',
     'one_hot',
     'autoincreased_step_counter',
+    'lod_reset',
 ]
 
 
@@ -81,6 +82,7 @@ def fc(input,
        num_flatten_dims=1,
        param_attr=None,
        bias_attr=None,
+       use_mkldnn=False,
        act=None,
        name=None):
     """
@@ -162,8 +164,11 @@ def fc(input,
             inputs={"X": input_var,
                     "Y": w},
             outputs={"Out": tmp},
-            attrs={"x_num_col_dims": num_flatten_dims,
-                   "y_num_col_dims": 1})
+            attrs={
+                "x_num_col_dims": num_flatten_dims,
+                "y_num_col_dims": 1,
+                'use_mkldnn': use_mkldnn
+            })
         mul_results.append(tmp)
 
     # sum
@@ -1116,12 +1121,14 @@ def conv2d(input,
            filter_size,
            stride=1,
            padding=0,
+           dilation=1,
            groups=None,
            param_attr=None,
            bias_attr=None,
            use_cudnn=True,
            use_mkldnn=False,
-           act=None):
+           act=None,
+           name=None):
     """
     **Convlution2D Layer**
 
@@ -1182,6 +1189,9 @@ def conv2d(input,
        padding(int|tuple): The padding size. If padding is a tuple, it must
            contain two integers, (padding_H, padding_W). Otherwise, the
            padding_H = padding_W = padding. Default: padding = 0.
+       dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+           contain two integers, (dilation_H, dilation_W). Otherwise, the
+           dilation_H = dilation_W = dilation. Default: dilation = 1.
        groups(int): The groups number of the Conv2d Layer. According to grouped
            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
            the first half of the filters is only connected to the first half
@@ -1192,6 +1202,8 @@ def conv2d(input,
        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. Default: True
        act(str): Activation type. Default: None
+       name(str|None): A name for this layer(optional). If set None, the layer
+           will be named automatically.
 
     Returns:
         Variable: The tensor variable storing the convolution and \
@@ -1232,6 +1244,7 @@ def conv2d(input,
     filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
     stride = utils.convert_to_list(stride, 2, 'stride')
     padding = utils.convert_to_list(padding, 2, 'padding')
+    dilation = utils.convert_to_list(dilation, 2, 'dilation')
 
     if not isinstance(use_cudnn, bool):
         raise ValueError("use_cudnn should be True or False")
@@ -1261,6 +1274,7 @@ def conv2d(input,
         attrs={
             'strides': stride,
             'paddings': padding,
+            'dilations': dilation,
             'groups': groups,
             'use_cudnn': use_cudnn,
             'use_mkldnn': use_mkldnn
@@ -1669,7 +1683,9 @@ def conv2d_transpose(input,
                      stride=1,
                      dilation=1,
                      param_attr=None,
+                     bias_attr=None,
                      use_cudnn=True,
+                     act=None,
                      name=None):
     """
     **Convlution2D transpose layer**
@@ -1738,8 +1754,10 @@ def conv2d_transpose(input,
            dilation_H = dilation_W = dilation. Default: dilation = 1.
        param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
                               Default: None
+       bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. Default: True
+       act(str): Activation type. Default: None
        name(str|None): A name for this layer(optional). If set None, the layer
            will be named automatically.
 
@@ -1792,12 +1810,12 @@ def conv2d_transpose(input,
     img_filter = helper.create_parameter(
         dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
 
-    out = helper.create_tmp_variable(dtype=input.dtype)
+    pre_bias = helper.create_tmp_variable(dtype=input.dtype)
     helper.append_op(
         type='conv2d_transpose',
         inputs={'Input': [input],
                 'Filter': [img_filter]},
-        outputs={'Output': out},
+        outputs={'Output': pre_bias},
         attrs={
             'strides': stride,
             'paddings': padding,
@@ -1805,55 +1823,57 @@ def conv2d_transpose(input,
             'use_cudnn': use_cudnn
         })
 
+    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
+    out = helper.append_activation(pre_act)
     return out
 
 
-def sequence_expand(x, y, name=None):
+def sequence_expand(x, y, ref_level=-1, name=None):
     """Sequence Expand Layer. This layer will expand the input variable **x**
-    according to LoD information of **y**. And the following examples will
-    explain how sequence_expand works:
+    according to specified level lod of **y**. Please note that lod level of
+    **x** is at most 1 and rank of **x** is at least 2. When rank of **x**
+    is greater than 2, then it would be viewed as a 2-D tensor.
+    Following examples will explain how sequence_expand works:
 
     .. code-block:: text
 
         * Case 1
             x is a LoDTensor:
-                x.lod = [[0,       2, 3],
-                         [0, 1,    3, 4]]
-                x.data = [a, b, c, d]
+                x.lod  = [[0,   2,        4]]
+                x.data = [[a], [b], [c], [d]]
                 x.dims = [4, 1]
 
             y is a LoDTensor:
                 y.lod = [[0,    2,    4],
                          [0, 3, 6, 7, 8]]
 
-            with condition len(y.lod[-1]) - 1 == x.dims[0]
+            ref_level: 0
 
-            then output is a 2-level LoDTensor:
-                out.lod = [[0,                2,    4],
-                           [0,       3,       6, 7, 8]]
-                out.data = [a, a, a, b, b, b, c, d]
+            then output is a 1-level LoDTensor:
+                out.lod =  [[0,   2,        4,        6,        8]]
+                out.data = [[a], [b], [a], [b], [c], [d], [c], [d]]
                 out.dims = [8, 1]
 
         * Case 2
             x is a Tensor:
-                x.data = [a, b, c]
+                x.data = [[a], [b], [c]]
                 x.dims = [3, 1]
 
             y is a LoDTensor:
-                y.lod = [[0, 2, 3, 6]]
+                y.lod = [[0, 2, 2, 5]]
 
-            with condition len(y.lod[-1]) - 1 == x.dims[0]
-
-            then output is a 1-level LoDTensor:
-                out.lod = [[0,    2, 3,      6]]
-                out.data = [a, a, b, c, c, c]
-                out.dims = [6, 1]
+            ref_level: -1
 
+            then output is a Tensor:
+                out.data = [[a], [a], [c], [c], [c]]
+                out.dims = [5, 1]
     Args:
         x (Variable): The input variable which is a Tensor or LoDTensor.
         y (Variable): The input variable which is a LoDTensor.
+        ref_level (int): Lod level of `y` to be referred by `x`. If set to -1,
+                         refer the last level of lod.
         name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
+                        will be named automatically.
 
     Returns:
         Variable: The expanded variable which is a LoDTensor.
@@ -1864,14 +1884,17 @@ def sequence_expand(x, y, name=None):
             x = fluid.layers.data(name='x', shape=[10], dtype='float32')
             y = fluid.layers.data(name='y', shape=[10, 20],
                              dtype='float32', lod_level=1)
-            out = layers.sequence_expand(x=x, y=y)
+            out = layers.sequence_expand(x=x, y=y, ref_level=0)
     """
     helper = LayerHelper('sequence_expand', input=x, **locals())
     dtype = helper.input_dtype()
     tmp = helper.create_tmp_variable(dtype)
     helper.append_op(
-        type='sequence_expand', inputs={'X': x,
-                                        'Y': y}, outputs={'Out': tmp})
+        type='sequence_expand',
+        inputs={'X': x,
+                'Y': y},
+        outputs={'Out': tmp},
+        attrs={'ref_level': ref_level})
     return tmp
 
 
@@ -2225,7 +2248,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
         keep_dim (bool|False): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
-        name(str|None): A name for this layer(optional). If set None, the 
+        name(str|None): A name for this layer(optional). If set None, the
             layer will be named automatically.
 
     Returns:
@@ -2241,7 +2264,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_prod(x)  # [0.0002268]
             fluid.layers.reduce_prod(x, dim=0)  # [0.02, 0.06, 0.3, 0.63]
             fluid.layers.reduce_prod(x, dim=-1)  # [0.027, 0.0084]
-            fluid.layers.reduce_prod(x, dim=1, 
+            fluid.layers.reduce_prod(x, dim=1,
                                      keep_dim=True)  # [[0.027], [0.0084]]
     """
     helper = LayerHelper('reduce_prod', **locals())
@@ -3292,3 +3315,98 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
         counter.stop_gradient = True
 
     return counter
+
+
+def lod_reset(x, y=None, target_lod=None):
+    """
+    LoD Reset Operator. Set LoD of **x** to a new one specified by **y** or
+    **target_lod**. When **y** provided, **y.lod** would be considered as target
+    LoD first, otherwise **y.data** would be considered as target LoD. If **y**
+    is not provided, target LoD should be specified by **target_lod**.
+    If target LoD is specified by **Y.data** or **target_lod**, only one level
+    LoD is supported.
+
+    .. code-block:: text
+
+        * Example 1:
+
+            Given a 1-level LoDTensor x:
+                x.lod =  [[ 0,     2,                   5      6 ]]
+                x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+                x.dims = [6, 1]
+
+            target_lod: [0, 4, 6]
+
+            then we get a 1-level LoDTensor:
+                out.lod =  [[ 0,                   4,            6 ]]
+                out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+                out.dims = [6, 1]
+
+        * Example 2:
+
+            Given a 1-level LoDTensor x:
+                x.lod =  [[ 0,     2,                   5      6 ]]
+                x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+                x.dims = [6, 1]
+
+            y is a Tensor:
+                y.data = [[0, 2, 6]]
+                y.dims = [1, 3]
+
+            then we get a 1-level LoDTensor:
+                out.lod =  [[ 0,     2,                          6 ]]
+                out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+                out.dims = [6, 1]
+
+        * Example 3:
+
+            Given a 1-level LoDTensor x:
+                x.lod =  [[ 0,      2,                   5     6 ]]
+                x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+                x.dims = [6, 1]
+
+            y is a 2-level LoDTensor:
+                y.lod =  [[0, 2, 4], [0, 2, 5, 6]]
+                y.data = [[1.1], [2.1], [3.1], [4.1], [5.1], [6.1]]
+                y.dims = [6, 1]
+
+            then we get a 2-level LoDTensor:
+                out.lod =  [[0, 2, 4], [0, 2, 5, 6]]
+                out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+                out.dims = [6, 1]
+
+    Args:
+        x (Variable): Input variable which could be a Tensor or LodTensor.
+        y (Variable|None): If provided, output's LoD would be derived from y.
+        target_lod (list|tuple|None): One level LoD which should be considered
+                                      as target LoD when y not provided.
+
+    Returns:
+        Variable: Output variable with LoD specified by this operator.
+
+    Raises:
+        ValueError: If y and target_lod are both None.
+
+    Examples:
+        .. code-block:: python
+
+            x = layers.data(name='x', shape=[10])
+            y = layers.data(name='y', shape=[10, 20], lod_level=2)
+            out = layers.lod_reset(x=x, y=y)
+    """
+    helper = LayerHelper("lod_reset", **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    if y is not None:
+        helper.append_op(
+            type="lod_reset", inputs={'X': x,
+                                      'Y': y}, outputs={'Out': out})
+    elif target_lod is not None:
+        helper.append_op(
+            type="lod_reset",
+            inputs={'X': x},
+            attrs={'target_lod': target_lod},
+            outputs={'Out': out})
+    else:
+        raise ValueError("y and target_lod should not be both None.")
+
+    return out
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index d7bad221c5fa7b18137bf317125195267437a644..f5c6b47d243dcf4ba985cfb41fc23b44d3ed809f 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -69,6 +69,7 @@ __all__ = [
     'gaussian_random_batch_size_like',
     'cumsum',
     'scatter',
+    'sum',
 ] + __activations__
 
 for _OP in set(__all__):
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 8b8621469d856e63dfd9685d7cd3d4c1d2ada1ce..180575c35dc6e115e11cccf9fff9fb2d3cd7e9a6 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from collections import defaultdict
-
+from paddle.fluid.framework import Program
 import framework
 import layers
 from backward import append_backward
@@ -23,8 +23,12 @@ from initializer import Constant
 from layer_helper import LayerHelper
 from regularizer import append_regularization_ops
 from clip import append_gradient_clip_ops, error_clip_callback
+from contextlib import contextmanager
 
-__all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad']
+__all__ = [
+    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad',
+    'Adadelta', 'ModelAverage'
+]
 
 
 class Optimizer(object):
@@ -119,7 +123,12 @@ class Optimizer(object):
         """
         pass
 
-    def _add_accumulator(self, name, param, dtype=None, fill_value=0.0):
+    def _add_accumulator(self,
+                         name,
+                         param,
+                         dtype=None,
+                         fill_value=0.0,
+                         shape=None):
         """Utility function to add an accumulator for a parameter
 
         Args:
@@ -133,17 +142,19 @@ class Optimizer(object):
                 param.name in self._accumulators[name]):
             raise Exception("Accumulator {} already exists for parameter {}".
                             format(name, param.name))
-
+        if shape == None:
+            shape = param.shape
         assert isinstance(self.helper, LayerHelper)
         var = self.helper.create_global_variable(
             name=unique_name.generate(name),
             persistable=True,
             dtype=dtype or param.dtype,
             type=param.type,
-            shape=param.shape)
+            shape=shape)
         self.helper.set_variable_initializer(
             var, initializer=Constant(value=float(fill_value)))
         self._accumulators[name][param.name] = var
+        return var
 
     def _get_accumulator(self, name, param):
         """Utility function to fetch an accumulator for a parameter
@@ -580,6 +591,205 @@ class DecayedAdagradOptimizer(Optimizer):
         return decayed_adagrad_op
 
 
+class AdadeltaOptimizer(Optimizer):
+    """
+    **Adadelta Optimizer**
+    Simple Adadelta optimizer with average squared grad state and
+    average squared update state.
+    The details of adadelta please refer to this
+    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
+    <http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf>`_.
+
+    ..  math::
+
+        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
+        learning\\_rate &= sqrt( ( E(dx_{t-1}^2) + \\epsilon ) / ( \\
+                          E(g_t^2) + \\epsilon ) ) \\\\
+        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
+
+    Args:
+        learning_rate(float): global leraning rate
+        rho(float): rho in equation
+        epsilon(float): epsilon in equation
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.Adadelta(
+                learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
+            _, params_grads = optimizer.minimize(cost)
+    """
+
+    _avg_squared_grad_acc_str = "_avg_squared_grad"
+    _avg_squared_update_acc_str = "_avg_squared_update"
+
+    def __init__(self, learning_rate, epsilon=1.0e-6, rho=0.95, **kwargs):
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+        if epsilon is None:
+            raise ValueError("epsilon is not set.")
+        if rho is None:
+            raise ValueError("rho is not set.")
+        super(AdadeltaOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        self.type = "adadelta"
+        self._epsilon = epsilon
+        self._rho = rho
+
+    def _create_accumulators(self, block, parameters):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        for p in parameters:
+            self._add_accumulator(self._avg_squared_grad_acc_str, p)
+            self._add_accumulator(self._avg_squared_update_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        avg_squared_grad_acc = self._get_accumulator(
+            self._avg_squared_grad_acc_str, param_and_grad[0])
+        avg_squared_update_acc = self._get_accumulator(
+            self._avg_squared_update_acc_str, param_and_grad[0])
+
+        # Create the adadelta optimizer op
+        adadelta_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "AvgSquaredGrad": avg_squared_grad_acc,
+                "AvgSquaredUpdate": avg_squared_update_acc
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "AvgSquaredGradOut": avg_squared_grad_acc,
+                "AvgSquaredUpdateOut": avg_squared_update_acc
+            },
+            attrs={"epsilon": self._epsilon,
+                   "rho": self._rho})
+
+        return adadelta_op
+
+
+class RMSPropOptimizer(Optimizer):
+    """
+    Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
+    rate method. The original slides proposed RMSProp: Slide 29 of
+    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .
+
+    The original equation is as follows:
+
+    ..  math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
+
+        w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+
+    The first equation calculates moving average of the squared gradient for
+    each weight. Then dividing the gradient by :math: `sqrt{v(w,t)}`.
+
+    In some cases, adding a momentum term :math: `\\beta` is beneficial.
+    In our implementation, Nesterov momentum is used:
+
+    ..  math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
+
+        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{v(w,t) +
+            \\epsilon}} \\nabla Q_{i}(w)
+
+        w & = w - v(w, t)
+
+    where, :math: `\\rho` is a hyperparameter and typical values are 0.9, 0.95
+    and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a
+    smoothing term to avoid division by zero, usually set somewhere in range
+    from 1e-4 to 1e-8.
+
+
+    Args:
+        learning_rate(float): global leraning rate.
+        rho(float): rho is :math: `\\rho` in equation, set 0.95 by default.
+        epsilon(float): :math: `\\epsilon` in equation is smoothing term to
+            avoid division by zero, set 1e-6 by default.
+        momentum(float): :math: `\\beta` in equation is the momentum term,
+            set 0.0 by default.
+
+    Raises:
+        ValueError: If learning_rate, rho, epsilon, momentum are None.
+
+    Examples:
+          .. code-block:: python
+
+              optimizer = fluid.optimizer.RMSProp(0.0001)
+              _, params_grads = optimizer.minimize(cost)
+    """
+
+    _momentum_acc_str = "momentum"
+    _mean_square_acc_str = "mean_square"
+
+    def __init__(self,
+                 learning_rate,
+                 rho=0.95,
+                 epsilon=1.0e-6,
+                 momentum=0.0,
+                 **kwargs):
+        super(RMSPropOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+        if rho is None:
+            raise ValueError("rho is not set.")
+        if epsilon is None:
+            raise ValueError("epsilon is not set.")
+        if momentum is None:
+            raise ValueError("momentum is not set.")
+
+        self.type = "rmsprop"
+        self._rho = rho
+        self._epsilon = epsilon
+        self._momentum = momentum
+
+    def _create_accumulators(self, block, parameters):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        for p in parameters:
+            self._add_accumulator(self._momentum_acc_str, p)
+            self._add_accumulator(self._mean_square_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        momentum_acc = self._get_accumulator(self._momentum_acc_str,
+                                             param_and_grad[0])
+        mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
+                                                param_and_grad[0])
+        rmsprop_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Moment": momentum_acc,
+                "MeanSquare": mean_square_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "MomentOut": momentum_acc,
+                "MeanSquareOut": mean_square_acc
+            },
+            attrs={
+                "epsilon": self._epsilon,
+                "decay": self._rho,
+                "momentum": self._momentum
+            })
+
+        return rmsprop_op
+
+
 # We short the class name, since users will use the optimizer with the package
 # name. The sample code:
 #
@@ -594,3 +804,145 @@ Adagrad = AdagradOptimizer
 Adam = AdamOptimizer
 Adamax = AdamaxOptimizer
 DecayedAdagrad = DecayedAdagradOptimizer
+Adadelta = AdadeltaOptimizer
+RMSProp = RMSPropOptimizer
+
+
+class ModelAverage(Optimizer):
+    """Accumulate the average of parameters whtin sliding window. The average
+    result will be saved in temporary variables which can be applied to
+    parameter variables of current model by calling 'apply()' method. And the
+    'restore()' method is used to restored the parameter values of current model.
+
+    The size of average window is determined by average_window_rate,
+    min_average_window, max_average_window and current update times.
+
+    Args:
+        params_grads: A list of parameter-grad variable pairs.
+        average_window_rate: The rate of average window.
+        min_average_window: The minimum size of average window.
+        max_average_window: The maximum size of average window.
+
+    Examples:
+        ...
+        optimizer = fluid.optimizer.Momentum()
+        _, params_grads = optimizer.minimize(cost)
+        model_average = fluid.optimizer.ModelAverage(params_grads, 0.15,
+                                                min_average_window=10000,
+                                                max_average_window=20000)
+        for pass_id in range(args.pass_num):
+            for data in train_reader():
+                exe.run(fluid.default_main_program()...)
+
+            with model_average.apply(exe):
+                for data in test_reader():
+                    exe.run(inference_program...)
+    """
+
+    def __init__(self,
+                 params_grads,
+                 average_window_rate,
+                 min_average_window=10000,
+                 max_average_window=10000,
+                 **kwargs):
+        super(ModelAverage, self).__init__(0.0, **kwargs)
+        self.average_window = average_window_rate
+        self.min_average_window = min_average_window
+        self.max_average_window = max_average_window
+        self.params_grads = params_grads
+        for param, grad in self.params_grads:
+            if grad is not None:
+                self._append_average_accumulate_op(param)
+
+        self.apply_program = Program()
+        block = self.apply_program.global_block()
+        with program_guard(main_program=self.apply_program):
+            for param_grad in self.params_grads:
+                if param_grad[1] is not None:
+                    self._add_average_apply_op(block, param_grad)
+
+        self.restore_program = Program()
+        block = self.restore_program.global_block()
+        with program_guard(main_program=self.restore_program):
+            for param_grad in self.params_grads:
+                if param_grad[1] is not None:
+                    self._add_average_restore_op(block, param_grad)
+
+    def _add_average_apply_op(self, block, param_grad):
+        param = block.clone_variable(param_grad[0])
+        grad = block.clone_variable(param_grad[1])
+        sum_1 = block.clone_variable(self._get_accumulator('sum_1', param))
+        sum_2 = block.clone_variable(self._get_accumulator('sum_2', param))
+        sum_3 = block.clone_variable(self._get_accumulator('sum_3', param))
+        num_accumulates = block.clone_variable(
+            self._get_accumulator('num_accumulates', param))
+        old_num_accumulates = block.clone_variable(
+            self._get_accumulator('old_num_accumulates', param))
+        num_updates = block.clone_variable(
+            self._get_accumulator('num_updates', param))
+        # backup param value to grad
+        layers.assign(input=param, output=grad)
+        # param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates)
+        tmp = layers.sum(x=[num_accumulates, old_num_accumulates])
+        sum = layers.sum(x=[sum_1, sum_2, sum_3])
+        tmp = layers.cast(x=tmp, dtype='float32')
+        sum = layers.cast(x=sum, dtype='float32')
+        layers.elementwise_div(x=sum, y=tmp, out=param)
+
+    def _add_average_restore_op(self, block, param_grad):
+        param = block.clone_variable(param_grad[0])
+        grad = block.clone_variable(param_grad[1])
+        layers.assign(input=grad, output=param)
+
+    def _append_average_accumulate_op(self, param):
+        self.helper = LayerHelper("average_accumulate")
+        sum_1 = self._add_accumulator('sum_1', param)
+        sum_2 = self._add_accumulator('sum_2', param)
+        sum_3 = self._add_accumulator('sum_3', param)
+        num_accumulates = self._add_accumulator(
+            'num_accumulates', param, dtype='int64', shape=[1])
+        old_num_accumulates = self._add_accumulator(
+            'old_num_accumulates', param, dtype='int64', shape=[1])
+        num_updates = self._add_accumulator(
+            'num_updates', param, dtype='int64', shape=[1])
+
+        self.helper.append_op(
+            type='average_accumulates',
+            inputs={
+                "param": param,
+                "in_sum_1": sum_1,
+                "in_sum_2": sum_2,
+                "in_sum_3": sum_3,
+                "in_num_accumulates": num_accumulates,
+                "in_old_num_accumulates": old_num_accumulates,
+                "in_num_updates": num_updates
+            },
+            outputs={
+                "out_sum_1": sum_1,
+                "out_sum_2": sum_2,
+                "out_sum_3": sum_3,
+                "out_num_accumulates": num_accumulates,
+                "out_old_num_accumulates": old_num_accumulates,
+                "out_num_updates": num_updates,
+            },
+            attrs={
+                "average_window": self.average_window,
+                "min_average_window": self.min_average_window,
+                "max_average_window": self.max_average_window,
+            })
+
+    @contextmanager
+    def apply(self, executor, need_restore=True):
+        """Apply average values to parameters of current model.
+        """
+        executor.run(self.apply_program)
+        try:
+            yield
+        finally:
+            if need_restore:
+                self.restore(executor)
+
+    def restore(self, executor):
+        """Restore parameter values of current model.
+        """
+        executor.run(self.restore_program)
diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py
index 029db7d2dd4b7def8cea374e3f2ed31226f2bc18..604c6f9ab36c2332223d1ba943d67113922615b3 100644
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -44,6 +44,11 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
     """
     params_and_grads = []
     for param, grad in parameters_and_grads:
+        # If no gradient then we don't need to do anything
+        if grad is None:
+            params_and_grads.append((param, grad))
+            continue
+
         regularization_term = None
         if param.regularizer is not None:
             # Add variable for regularization term in grad block
@@ -51,9 +56,8 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
         elif regularization is not None:
             regularization_term = regularization(param, grad, grad.block)
 
-        # If no gradient or no regularization specified,
-        # then we don't need to do anything
-        if grad is None or regularization_term is None:
+        # If no regularization specified, then we don't need to do anything
+        if regularization_term is None:
             params_and_grads.append((param, grad))
             continue
 
diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py
index fa38bd3762423497b82c3b421b3a1db4cd87525b..3a1a0859ecfd4ac5337e2112f8b22e32d8474f22 100644
--- a/python/paddle/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/test_machine_translation.py
@@ -118,12 +118,12 @@ def decoder_decode(context, is_sparse):
             is_sparse=is_sparse)
 
         # use rnn unit to update rnn
-        current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded],
+        current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
                               size=decoder_size,
                               act='tanh')
-
+        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
         # use score to do beam search
-        current_score = pd.fc(input=current_state,
+        current_score = pd.fc(input=current_state_with_lod,
                               size=target_dict_dim,
                               act='softmax')
         topk_scores, topk_indices = pd.topk(current_score, k=50)
diff --git a/python/paddle/fluid/tests/unittests/.gitignore b/python/paddle/fluid/tests/unittests/.gitignore
index 6b3fc2a83c649c28d21c9a8a0b35c2f2fa04f269..ad02bdecf436bba925e2e3b7efb20c878df70dfd 100644
--- a/python/paddle/fluid/tests/unittests/.gitignore
+++ b/python/paddle/fluid/tests/unittests/.gitignore
@@ -1 +1,4 @@
 mnist.recordio
+mnist_0.recordio
+mnist_1.recordio
+mnist_2.recordio
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 6a42f763a6c436f5d33569dc65f711c2930a8b2e..8393f7827b1c7d361ebea72f2cfc6033268772f0 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -483,9 +483,9 @@ class OpTest(unittest.TestCase):
             input: input numpy array
 
         Returns:
-            input: if the dtype of input is np.float16, its dtype will be
-                changed to np.uint16 so that the internal memory will be 
-                reinterpreted input as of dtype np.uint16. 
+            input: The dtype of input will be changed to np.uint16 if 
+                it is originally np.float16, such that the internal memory
+                of input will be reinterpreted as of dtype np.uint16. 
         """
         if input.dtype == np.float16:
             input.dtype = np.uint16
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index eab41ebe711bd21bdc3b34ca83ab57388cc35ba2..4a2b35322dd4b9718c83eb5ee679ada382938441 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -14,6 +14,7 @@
 
 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest
 from scipy.special import expit
 
@@ -212,18 +213,39 @@ class TestRound(OpTest):
 class TestRelu(OpTest):
     def setUp(self):
         self.op_type = "relu"
-        x = np.random.uniform(-1, 1, [11, 17]).astype("float32")
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
         # The same reason with TestAbs
         x[np.abs(x) < 0.005] = 0.02
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.maximum(self.inputs['X'], 0)}
+        out = np.maximum(x, 0)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Relu(TestRelu):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestBRelu(OpTest):
     def setUp(self):
@@ -484,5 +506,54 @@ class TestSwish(OpTest):
         self.check_grad(['X'], 'Out', max_relative_error=0.008)
 
 
+#--------------------test MKLDNN--------------------
+class TestMKLDNNRelu(TestRelu):
+    def setUp(self):
+        super(TestMKLDNNRelu, self).setUp()
+
+        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
+        # The same reason with TestAbs
+        x[np.abs(x) < 0.005] = 0.02
+        out = np.maximum(x, 0)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNTanh(TestTanh):
+    def setUp(self):
+        super(TestMKLDNNTanh, self).setUp()
+
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
+        }
+        self.outputs = {'Out': np.tanh(self.inputs['X'])}
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNSqrt(TestSqrt):
+    def setUp(self):
+        super(TestMKLDNNSqrt, self).setUp()
+
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
+        }
+        self.outputs = {'Out': np.sqrt(self.inputs['X'])}
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNAbs(TestAbs):
+    def setUp(self):
+        super(TestMKLDNNAbs, self).setUp()
+
+        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
+        # The same reason with TestAbs
+        x[np.abs(x) < 0.005] = 0.02
+        self.inputs = {'X': x}
+        self.outputs = {'Out': np.abs(self.inputs['X'])}
+        self.attrs = {"use_mkldnn": True}
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index 80e6fa6df3c21aa19feb571916f11c41ccd6bb10..10aa63e18a6eeaa44e5b12f7532998dca2bc5e9f 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -31,6 +31,37 @@ def get_backward_op(scope, op, no_grad_set):
     return backward_op
 
 
+def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
+    x_shape = x.shape
+    if len(x_shape) == 2:
+        if data_format == "NCHW":
+            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
+        else:
+            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
+
+    if data_format == "NCHW":
+        n, c, h, w = x.shape
+        mean_tile = np.reshape(mean, (1, c, 1, 1))
+        mean_tile = np.tile(mean_tile, (n, 1, h, w))
+        var_tile = np.reshape(var, (1, c, 1, 1))
+        var_tile = np.tile(var_tile, (n, 1, h, w))
+        normalized = (x - mean_tile) / np.sqrt(var_tile + epsilon)
+        scale_tile = np.reshape(scale, (1, c, 1, 1))
+        scale_tile = np.tile(scale_tile, (n, 1, h, w))
+        offset_tile = np.reshape(offset, (1, c, 1, 1))
+        offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
+        y = normalized * scale_tile + offset_tile
+    elif data_format == "NHWC":
+        normalized = (x - mean) / np.sqrt(var + epsilon)
+        y = normalized * scale + offset
+    else:
+        raise ValueError("Unknown data order.")
+
+    if len(x_shape) == 2:
+        y = np.reshape(y, x_shape)
+    return y
+
+
 def _reference_training(x, scale, offset, epsilon, data_format):
     x_shape = x.shape
     if len(x_shape) == 2:
@@ -155,11 +186,159 @@ def set_output_grad(scope, outputs, place, feed_dict=None):
         __set_tensor__(output, data)
 
 
-class TestBatchNormOp(OpTest):
+class TestBatchNormOpInference(OpTest):
+    def setUp(self):
+        self.dtype = np.float32
+
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
         self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
 
-    def test_python(self):
+    def check_with_place(self, place, data_layout, dtype, shape):
+        epsilon = 0.00001
+        if len(shape) == 2:
+            x_shape = shape
+            c = x_shape[1]
+        else:
+            n, h, w, c = shape[0], shape[1], shape[2], shape[3]
+            if data_layout == "NHWC":
+                x_shape = [n, h, w, c]
+            elif data_layout == "NCHW":
+                x_shape = [n, c, h, w]
+            else:
+                raise ValueError("Unknown data layout.")
+        scale_shape = [c]
+
+        x_val = np.random.random_sample(x_shape).astype(dtype)
+        scale_val = np.random.random_sample(scale_shape).astype(np.float32)
+        bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+
+        mean = np.zeros(scale_shape).astype(np.float32)
+        variance = np.ones(scale_shape).astype(np.float32)
+
+        y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance,
+                                   epsilon, data_layout).astype(dtype)
+
+        scope = core.Scope()
+
+        # create input
+        x_tensor = create_or_get_tensor(scope, "x_val",
+                                        OpTest.np_dtype_to_fluid_dtype(x_val),
+                                        place)
+        scale_tensor = create_or_get_tensor(
+            scope, "scale_val",
+            OpTest.np_dtype_to_fluid_dtype(scale_val), place)
+        bias_tensor = create_or_get_tensor(
+            scope, "bias_val", OpTest.np_dtype_to_fluid_dtype(bias_val), place)
+        mean_tensor = create_or_get_tensor(scope, "mean",
+                                           OpTest.np_dtype_to_fluid_dtype(mean),
+                                           place)
+        variance_tensor = create_or_get_tensor(
+            scope, "variance", OpTest.np_dtype_to_fluid_dtype(variance), place)
+
+        # create output
+        y_tensor = create_or_get_tensor(scope, "y_out", None, place)
+        saved_mean_tensor = create_or_get_tensor(scope, "saved_mean", None,
+                                                 place)
+        saved_variance_tensor = create_or_get_tensor(scope, "saved_variance",
+                                                     None, place)
+        mean_out_tensor = mean_tensor
+        variance_out_tensor = variance_tensor
+
+        batch_norm_op = Operator(
+            "batch_norm",
+            # inputs
+            X="x_val",
+            Scale="scale_val",
+            Bias="bias_val",
+            Mean="mean",
+            Variance="variance",
+            # outputs
+            Y="y_out",
+            MeanOut="mean",
+            VarianceOut="variance",
+            SavedMean="saved_mean",
+            SavedVariance="saved_variance",
+            # attrs
+            is_test=True,
+            data_layout=data_layout,
+            epsilon=epsilon)
+
+        batch_norm_op.run(scope, place)
+
+        # check inference result
+        self.__assert_close(
+            y_tensor,
+            y_out,
+            "inference output are different at " + str(place) + ", " +
+            data_layout + ", " + str(np.dtype(dtype)) +
+            str(np.array(y_tensor)) + str(y_out),
+            atol=1e-3)
+
+    def test_check_output(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            for data_format in ["NCHW", "NHWC"]:
+                self.check_with_place(place, data_format, self.dtype,
+                                      [2, 3, 4, 5])
+                self.check_with_place(place, data_format, self.dtype, [2, 3])
+
+
+class TestFP16BatchNormOpInference(TestBatchNormOpInference):
+    def setUp(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        places = []
+        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                places.append(place)
+
+        for place in places:
+            for data_format in ["NCHW", "NHWC"]:
+                self.check_with_place(place, data_format, self.dtype,
+                                      [2, 3, 4, 5])
+                self.check_with_place(place, data_format, self.dtype, [2, 3])
+
+
+class TestBatchNormOpTraining(OpTest):
+    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
+
+    def test_python_testing(self):
+        data_format = "NHWC"
+        epsilon = 0.00001
+
+        n, h, w, c = 2, 3, 4, 5
+        x_shape = [n, h, w, c]
+        scale_shape = [c]
+
+        x_val = np.random.random_sample(x_shape).astype(np.float32)
+        scale_val = np.random.random_sample(scale_shape).astype(np.float32)
+        bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+
+        mean = np.zeros(scale_shape).astype(np.float32)
+        variance = np.ones(scale_shape).astype(np.float32)
+
+        y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance,
+                                   epsilon, "NHWC")
+
+        # running N, C, H, W case
+        # should produce the same results
+        x_shape2 = [n, c, h, w]
+        x_val2 = np.transpose(x_val, (0, 3, 1, 2))
+        y_out2 = _reference_testing(x_val2, scale_val, bias_val, mean, variance,
+                                    epsilon, "NCHW")
+
+        # transfer (N, C, H, W) back to (N, H, W, C)
+        y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1))
+        self.__assert_close(y_out, y_out2_trans, "inference output")
+        print 'python: NHWC, NCHW, inference checking passed'
+
+    def test_python_training(self):
         data_format = "NHWC"
         epsilon = 0.00001
         momentum = 0.9
@@ -197,7 +376,7 @@ class TestBatchNormOp(OpTest):
 
         # transfer (N, C, H, W) back to (N, H, W, C)
         y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1))
-        self.__assert_close(y_out, y_out2_trans, "batch variance")
+        self.__assert_close(y_out, y_out2_trans, "batch output")
         print 'python: NHWC, NCHW, forward checking passed'
 
         # test backward now
diff --git a/python/paddle/fluid/tests/unittests/test_cast_op.py b/python/paddle/fluid/tests/unittests/test_cast_op.py
index 8fb8d03828393ccfe57c0848d79b960c641ad39a..b8d3ed3aa3eb0e47e79f46cdf681a3b9cca46036 100644
--- a/python/paddle/fluid/tests/unittests/test_cast_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cast_op.py
@@ -18,7 +18,7 @@ import numpy as np
 import paddle.fluid.core as core
 
 
-class TestCastOp(op_test.OpTest):
+class TestCastOp1(op_test.OpTest):
     def setUp(self):
         ipt = np.random.random(size=[10, 10])
         self.inputs = {'X': ipt.astype('float32')}
@@ -36,5 +36,36 @@ class TestCastOp(op_test.OpTest):
         self.check_grad(['X'], ['Out'])
 
 
+class TestCastOp2(op_test.OpTest):
+    def setUp(self):
+        ipt = np.random.random(size=[10, 10])
+        # numpy float16 is binded to fluid float16 via uint16
+        self.inputs = {'X': ipt.astype('float16').view(np.uint16)}
+        self.outputs = {'Out': ipt.astype('float32')}
+        self.attrs = {
+            'in_dtype': int(core.VarDesc.VarType.FP16),
+            'out_dtype': int(core.VarDesc.VarType.FP32)
+        }
+        self.op_type = 'cast'
+
+    def test_check_output(self):
+        self.check_output(atol=1e-3)
+
+
+class TestCastOp3(op_test.OpTest):
+    def setUp(self):
+        ipt = np.random.random(size=[10, 10])
+        self.inputs = {'X': ipt.astype('float32')}
+        self.outputs = {'Out': ipt.astype('float16')}
+        self.attrs = {
+            'in_dtype': int(core.VarDesc.VarType.FP32),
+            'out_dtype': int(core.VarDesc.VarType.FP16)
+        }
+        self.op_type = 'cast'
+
+    def test_check_output(self):
+        self.check_output(atol=1e-3)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py
index 558f3a4dcbb8fe39c427d8b100f4488440e8b8cb..1e00d67d5480bfa77a60e1aed52cafac6e8242ca 100644
--- a/python/paddle/fluid/tests/unittests/test_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_op.py
@@ -20,19 +20,35 @@ from op_test import OpTest
 class TestConcatOp(OpTest):
     def setUp(self):
         self.op_type = "concat"
-        x0 = np.random.random((2, 1, 4, 5)).astype('float32')
-        x1 = np.random.random((2, 2, 4, 5)).astype('float32')
-        x2 = np.random.random((2, 3, 4, 5)).astype('float32')
-        axis = 1
-        self.inputs = {'X': [('x0', x0), ('x1', x1), ('x2', x2)]}
-        self.attrs = {'axis': axis}
-        self.outputs = {'Out': np.concatenate((x0, x1, x2), axis=axis)}
+        self.init_test_data()
+        self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
+        self.attrs = {'axis': self.axis}
+        self.outputs = {
+            'Out': np.concatenate(
+                (self.x0, self.x1, self.x2), axis=self.axis)
+        }
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
         self.check_grad(['x0'], 'Out')
+        self.check_grad(['x1'], 'Out')
+        self.check_grad(['x2'], 'Out')
+
+    def init_test_data(self):
+        self.x0 = np.random.random((2, 1, 4, 5)).astype('float32')
+        self.x1 = np.random.random((2, 2, 4, 5)).astype('float32')
+        self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
+        self.axis = 1
+
+
+class TestConcatOp2(OpTest):
+    def init_test_data(self):
+        self.x0 = np.random.random((2, 3, 4, 5)).astype('float32')
+        self.x1 = np.random.random((2, 3, 4, 5)).astype('float32')
+        self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
+        self.axis = 1
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
index 7913b98240fbd0aaa8d94911e68f61a0e416e7c8..4b6e3fb69a12095c77f343515fe3b6d1f3fccb14 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -63,12 +63,13 @@ def conv2d_forward_naive(input, filter, group, conv_param):
 
 class TestConv2dOp(OpTest):
     def setUp(self):
+        self.op_type = "conv2d"
         self.use_cudnn = False
         self.use_mkldnn = False
-        self.init_op_type()
+        self.dtype = np.float32
+        self.init_kernel_type()
         self.init_group()
         self.init_dilation()
-        self.init_data_type()
         self.init_test_case()
 
         conv2d_param = {
@@ -159,17 +160,14 @@ class TestConv2dOp(OpTest):
         f_c = self.input_size[1] / self.groups
         self.filter_size = [6, f_c, 3, 3]
 
-    def init_data_type(self):
-        self.dtype = np.float32
-
     def init_dilation(self):
         self.dilations = [1, 1]
 
     def init_group(self):
         self.groups = 1
 
-    def init_op_type(self):
-        self.op_type = "conv2d"
+    def init_kernel_type(self):
+        pass
 
 
 class TestWithPad(TestConv2dOp):
@@ -241,13 +239,13 @@ class TestWithInput1x1Filter1x1(TestConv2dOp):
 
 #----------------Conv2dCUDNN----------------
 class TestCUDNN(TestConv2dOp):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv2d"
 
 
-class TestFP16CUDNN(TestCUDNN):
-    def init_data_type(self):
+class TestFP16CUDNN(TestConv2dOp):
+    def init_kernel_type(self):
+        self.use_cudnn = True
         self.dtype = np.float16
 
     def test_check_output(self):
@@ -258,13 +256,13 @@ class TestFP16CUDNN(TestCUDNN):
 
 
 class TestCUDNNWithPad(TestWithPad):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv2d"
 
 
-class TestFP16CUDNNWithPad(TestCUDNNWithPad):
-    def init_data_type(self):
+class TestFP16CUDNNWithPad(TestWithPad):
+    def init_kernel_type(self):
+        self.use_cudnn = True
         self.dtype = np.float16
 
     def test_check_output(self):
@@ -275,13 +273,13 @@ class TestFP16CUDNNWithPad(TestCUDNNWithPad):
 
 
 class TestCUDNNWithStride(TestWithStride):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv2d"
 
 
-class TestFP16CUDNNWithStride(TestCUDNNWithStride):
-    def init_data_type(self):
+class TestFP16CUDNNWithStride(TestWithStride):
+    def init_kernel_type(self):
+        self.use_cudnn = True
         self.dtype = np.float16
 
     def test_check_output(self):
@@ -292,13 +290,13 @@ class TestFP16CUDNNWithStride(TestCUDNNWithStride):
 
 
 class TestCUDNNWithGroup(TestWithGroup):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv2d"
 
 
-class TestFP16CUDNNWithGroup(TestCUDNNWithGroup):
-    def init_data_type(self):
+class TestFP16CUDNNWithGroup(TestWithGroup):
+    def init_kernel_type(self):
+        self.use_cudnn = True
         self.dtype = np.float16
 
     def test_check_output(self):
@@ -309,13 +307,13 @@ class TestFP16CUDNNWithGroup(TestCUDNNWithGroup):
 
 
 class TestCUDNNWith1x1(TestWith1x1):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv2d"
 
 
-class TestFP16CUDNNWith1x1(TestCUDNNWith1x1):
-    def init_data_type(self):
+class TestFP16CUDNNWith1x1(TestWith1x1):
+    def init_kernel_type(self):
+        self.use_cudnn = True
         self.dtype = np.float16
 
     def test_check_output(self):
@@ -326,13 +324,13 @@ class TestFP16CUDNNWith1x1(TestCUDNNWith1x1):
 
 
 class TestCUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv2d"
 
 
-class TestFP16CUDNNWithInput1x1Filter1x1(TestCUDNNWithInput1x1Filter1x1):
-    def init_data_type(self):
+class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
+    def init_kernel_type(self):
+        self.use_cudnn = True
         self.dtype = np.float16
 
     def test_check_output(self):
@@ -375,21 +373,18 @@ class TestDepthwiseConv2(TestConv2dOp):
 
 #----------------Conv2dMKLDNN----------------
 class TestMKLDNN(TestConv2dOp):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_mkldnn = True
-        self.op_type = "conv2d"
 
 
 class TestMKLDNNWithPad(TestWithPad):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_mkldnn = True
-        self.op_type = "conv2d"
 
 
 class TestMKLDNNWithStride(TestWithStride):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_mkldnn = True
-        self.op_type = "conv2d"
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py
index 60930a612c128cbf18e89711b9246d148e41ec58..eaa3435a86462236a99489749abe877648677053 100644
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -14,6 +14,7 @@
 
 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest
 
 
@@ -82,5 +83,37 @@ class TestDropoutOp5(OpTest):
         self.check_output()
 
 
+class TestFP16DropoutOp(OpTest):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.init_test_case()
+
+        x = np.random.random(self.input_size).astype("float16")
+        out = x * (1.0 - self.prob)
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.attrs = {
+            'dropout_prob': self.prob,
+            'fix_seed': self.fix_seed,
+            'is_test': True
+        }
+        self.outputs = {'Out': out}
+
+    def init_test_case(self):
+        self.input_size = [32, 64]
+        self.prob = 0.35
+        self.fix_seed = True
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda() and core.op_support_gpu("dropout"):
+            self.check_output_with_place(core.CUDAPlace(0), atol=1e-3)
+
+
+class TestFP16DropoutOp2(TestFP16DropoutOp):
+    def init_test_case(self):
+        self.input_size = [32, 64, 3]
+        self.prob = 0.75
+        self.fix_seed = False
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
index 5b2384e94d788342c692fcb8e33f3a2ff663ab53..1f52bd90d0d49bda6c180019e90ebd923c91439c 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -13,158 +13,243 @@
 # limitations under the License.
 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest
 
 
-class TestElementwiseOp(OpTest):
+class TestElementwiseAddOp(OpTest):
     def setUp(self):
         self.op_type = "elementwise_add"
+        self.dtype = np.float32
+        self.axis = -1
+        self.init_dtype()
+        self.init_input_output()
+        self.init_axis()
+
         self.inputs = {
-            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
         }
-        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['Y'])}
+        self.attrs = {'axis': self.axis}
+        self.outputs = {'Out': self.out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad_normal(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
 
     def test_check_grad_ingore_x(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(
             ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(
             ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
 
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
 
-class TestElementwiseAddOp_scalar(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(1).astype(np.float32)
-        }
-        self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
+    def init_dtype(self):
+        pass
 
+    def init_axis(self):
+        pass
 
-class TestElementwiseAddOp_scalar2(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(1, 1).astype(np.float32)
-        }
-        self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
 
+class TestFP16ElementwiseAddOp(TestElementwiseAddOp):
+    def init_dtype(self):
+        self.dtype = np.float16
 
-class TestElementwiseAddOp_Vector(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.random((32, )).astype("float32"),
-            'Y': np.random.random((32, )).astype("float32")
-        }
-        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['Y'])}
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
-class TestElementwiseAddOp_broadcast_0(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(2).astype(np.float32)
-        }
+class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y
 
-        self.attrs = {'axis': 0}
-        self.outputs = {
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(2, 1, 1)
-        }
 
+class TestFP16ElementwiseAddOp_scalar(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y
 
-class TestElementwiseAddOp_broadcast_1(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(3).astype(np.float32)
-        }
 
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 1)
-        }
+class TestElementwiseAddOp_scalar2(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1, 1).astype(self.dtype)
+        self.out = self.x + self.y
 
 
-class TestElementwiseAddOp_broadcast_2(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(4).astype(np.float32)
-        }
+class TestFP16ElementwiseAddOp_scalar2(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1, 1).astype(self.dtype)
+        self.out = self.x + self.y
 
-        self.outputs = {
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 1, 4)
-        }
 
+class TestElementwiseAddOp_Vector(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.random((32, )).astype(self.dtype)
+        self.y = np.random.random((32, )).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
 
-class TestElementwiseAddOp_broadcast_3(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
-            'Y': np.random.rand(3, 4).astype(np.float32)
-        }
 
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 4, 1)
-        }
+class TestFP16ElementwiseAddOp_Vector(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.random((32, )).astype(self.dtype)
+        self.y = np.random.random((32, )).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
 
 
-class TestElementwiseAddOp_broadcast_4(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
-            'Y': np.random.rand(2, 1).astype(np.float32)
-        }
+class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(2).astype(self.dtype)
+        self.out = self.x + self.y.reshape(2, 1, 1)
 
-        self.attrs = {'axis': 0}
-        self.outputs = {
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(2, 1, 1, 1)
-        }
+    def init_axis(self):
+        self.axis = 0
 
 
-class TestElementwiseAddOp_rowwise_add_0(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(3, 4).astype(np.float32)
-        }
+class TestFP16ElementwiseAddOp_broadcast_0(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(2).astype(self.dtype)
+        self.out = self.x + self.y.reshape(2, 1, 1)
 
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 4)
-        }
+    def init_axis(self):
+        self.axis = 0
 
 
-class TestElementwiseAddOp_rowwise_add_1(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 1).astype(np.float32),
-            'Y': np.random.rand(1).astype(np.float32)
-        }
+class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(3).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 1)
 
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 1)
-        }
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestFP16ElementwiseAddOp_broadcast_1(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(3).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1, 4)
+
+
+class TestFP16ElementwiseAddOp_broadcast_2(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1, 4)
+
+
+class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 4, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestFP16ElementwiseAddOp_broadcast_3(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 4, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(2, 1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(2, 1, 1, 1)
+
+    def init_axis(self):
+        self.axis = 0
+
+
+class TestFP16ElementwiseAddOp_broadcast_4(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(2, 1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(2, 1, 1, 1)
+
+    def init_axis(self):
+        self.axis = 0
+
+
+class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 4)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestFP16ElementwiseAddOp_rowwise_add_0(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 4)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 1).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 1).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1)
+
+    def init_axis(self):
+        self.axis = 1
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_executor_and_mul.py b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
index 4958bef3ef4d101f934a2776efc21efdd24a9a4d..e1272c1d6dd7131b55ecf33fa0de0fc78a3ac5a7 100644
--- a/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
@@ -16,7 +16,6 @@ import unittest
 
 import numpy
 import paddle.fluid.core as core
-
 from paddle.fluid.executor import Executor
 from paddle.fluid.layers import mul, data
 
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 90d70aa39fdc4d4d3f9062eb6a3eb0cdd014acfc..b5fd59cf3a1bea50b799c3ace8f3b9cea088b9d5 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -181,8 +181,8 @@ class TestBook(unittest.TestCase):
         with program_guard(program):
             x = layers.data(name='x', shape=[10], dtype='float32')
             y = layers.data(
-                name='y', shape=[10, 20], dtype='float32', lod_level=1)
-            self.assertIsNotNone(layers.sequence_expand(x=x, y=y))
+                name='y', shape=[10, 20], dtype='float32', lod_level=2)
+            self.assertIsNotNone(layers.sequence_expand(x=x, y=y, ref_level=1))
         print(str(program))
 
     def test_lstm_unit(self):
@@ -327,6 +327,15 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(loss)
         print(str(program))
 
+    def test_lod_reset(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[10], dtype='float32')
+            y = layers.data(
+                name='y', shape=[10, 20], dtype='float32', lod_level=2)
+            print(layers.lod_reset(x=x, y=y))
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
index 3bf8230f8748dd87ec3c85b0cbd78df2e695a96b..6b6d4c824aeae319dacf224408ce96a0d9c5bb35 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
@@ -42,7 +42,7 @@ class TestLodResetOpByInput(OpTest):
         target_lod_0 = [0, 4, 7, 10]
         self.inputs = {
             'X': (x, lod),
-            'TargetLoD': np.array([target_lod_0]).astype('int32')
+            'Y': np.array([target_lod_0]).astype('int32')
         }
         self.outputs = {'Out': (x, [target_lod_0])}
 
@@ -50,7 +50,7 @@ class TestLodResetOpByInput(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out", no_grad_set=set("TargetLoD"))
+        self.check_grad(["X"], "Out", no_grad_set=set("Y"))
 
 
 class TestLodResetOpBoth(OpTest):
@@ -62,7 +62,7 @@ class TestLodResetOpBoth(OpTest):
         target_lod_0_in = [0, 4, 7, 10]
         self.inputs = {
             'X': (x, lod),
-            'TargetLoD': np.array(target_lod_0_in).astype('int32')
+            'Y': np.array(target_lod_0_in).astype('int32')
         }
         self.attrs = {'target_lod': target_lod_0_attr}
         self.outputs = {'Out': (x, [target_lod_0_in])}
@@ -71,7 +71,24 @@ class TestLodResetOpBoth(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out", no_grad_set=set("TargetLoD"))
+        self.check_grad(["X"], "Out", no_grad_set=set("Y"))
+
+
+class TestLodResetOpYIsLoDTensor(OpTest):
+    def setUp(self):
+        self.op_type = "lod_reset"
+        x = np.random.random((10, 20)).astype("float32")
+        lod = [[0, 3, 5, 10]]
+        y = np.random.random((10, 10)).astype("float32")
+        target_lod_0 = [[0, 4, 7, 10]]
+        self.inputs = {'X': (x, lod), 'Y': (y, target_lod_0)}
+        self.outputs = {'Out': (x, target_lod_0)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", no_grad_set=set("Y"))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py
index eaff45cbb2a58798e9d55149510bec72eea370cd..2268eafdbd08cd0d6a175d19cedd79b7b984289b 100644
--- a/python/paddle/fluid/tests/unittests/test_lrn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py
@@ -87,5 +87,15 @@ class TestLRNOp(OpTest):
         self.check_grad(['X'], 'Out', max_relative_error=0.01)
 
 
+class TestLRNMKLDNNOp(TestLRNOp):
+    def get_attrs(self):
+        attrs = TestLRNOp.get_attrs(self)
+        attrs['use_mkldnn'] = True
+        return attrs
+
+    def test_check_output(self):
+        self.check_output(atol=0.002)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..8add353303e3626bbce68199a100306d4858766a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
@@ -0,0 +1,65 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+import paddle.v2.dataset.mnist as mnist
+
+
+class TestMultipleReader(unittest.TestCase):
+    def setUp(self):
+        self.batch_size = 64
+        self.pass_num = 3
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_file = paddle.batch(mnist.train(), batch_size=self.batch_size)
+            feeder = fluid.DataFeeder(
+                feed_list=[
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            self.num_batch = fluid.recordio_writer.convert_reader_to_recordio_file(
+                './mnist.recordio', data_file, feeder)
+
+    def test_main(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_file = fluid.layers.open_recordio_file(
+                filename='./mnist.recordio',
+                shapes=[(-1, 784), (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'])
+            data_file = fluid.layers.create_multi_pass_reader(
+                reader=data_file, pass_num=self.pass_num)
+            img, label = fluid.layers.read_file(data_file)
+
+            if fluid.core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            else:
+                place = fluid.CPUPlace()
+
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+
+            batch_count = 0
+            while not data_file.eof():
+                img_val, = exe.run(fetch_list=[img])
+                batch_count += 1
+                self.assertLessEqual(img_val.shape[0], self.batch_size)
+            data_file.reset()
+            self.assertEqual(batch_count, self.num_batch * self.pass_num)
diff --git a/python/paddle/fluid/tests/unittests/test_multiple_reader.py b/python/paddle/fluid/tests/unittests/test_multiple_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..69f8acf81efaba8fc0f3df4cfe3a42dc4e477df2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_multiple_reader.py
@@ -0,0 +1,74 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+import paddle.v2.dataset.mnist as mnist
+from shutil import copyfile
+
+
+class TestMultipleReader(unittest.TestCase):
+    def setUp(self):
+        self.batch_size = 64
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=self.batch_size)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            self.num_batch = fluid.recordio_writer.convert_reader_to_recordio_file(
+                './mnist_0.recordio', reader, feeder)
+        copyfile('./mnist_0.recordio', './mnist_1.recordio')
+        copyfile('./mnist_0.recordio', './mnist_2.recordio')
+
+    def main(self, thread_num):
+        file_list = [
+            './mnist_0.recordio', './mnist_1.recordio', './mnist_2.recordio'
+        ]
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_files = fluid.layers.open_files(
+                filenames=file_list,
+                thread_num=thread_num,
+                shapes=[(-1, 784), (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'])
+            img, label = fluid.layers.read_file(data_files)
+
+            if fluid.core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            else:
+                place = fluid.CPUPlace()
+
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+
+            batch_count = 0
+            while not data_files.eof():
+                img_val, = exe.run(fetch_list=[img])
+                batch_count += 1
+                self.assertLessEqual(img_val.shape[0], self.batch_size)
+            data_files.reset()
+            self.assertEqual(batch_count, self.num_batch * 3)
+
+    def test_main(self):
+        self.main(thread_num=3)  # thread number equals to file number
+        self.main(thread_num=10)  # thread number is larger than file number
+        self.main(thread_num=2)  # thread number is less than file number
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
index 964d78f1966aa10e36eeaabe943d44e002d50293..764fa575fba1615de3171e848890b3836e640849 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -78,20 +78,22 @@ def avg_pool2D_forward_naive(x,
 
 class TestPool2d_Op(OpTest):
     def setUp(self):
+        self.op_type = "pool2d"
         self.use_cudnn = False
         self.use_mkldnn = False
+        self.dtype = np.float32
         self.init_test_case()
         self.init_global_pool()
-        self.init_op_type()
+        self.init_kernel_type()
         self.init_pool_type()
         self.init_ceil_mode()
         if self.global_pool:
             self.paddings = [0 for _ in range(len(self.paddings))]
-        input = np.random.random(self.shape).astype("float32")
+        input = np.random.random(self.shape).astype(self.dtype)
         output = self.pool2D_forward_naive(input, self.ksize, self.strides,
                                            self.paddings, self.global_pool,
-                                           self.ceil_mode).astype("float32")
-        self.inputs = {'X': input}
+                                           self.ceil_mode).astype(self.dtype)
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
 
         self.attrs = {
             'strides': self.strides,
@@ -105,7 +107,7 @@ class TestPool2d_Op(OpTest):
             'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
         }
 
-        self.outputs = {'Out': output.astype('float32')}
+        self.outputs = {'Out': output}
 
     def test_check_output(self):
         if self.use_cudnn:
@@ -115,6 +117,8 @@ class TestPool2d_Op(OpTest):
             self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         if self.use_cudnn and self.pool_type != "max":
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
@@ -128,8 +132,8 @@ class TestPool2d_Op(OpTest):
         self.strides = [1, 1]
         self.paddings = [0, 0]
 
-    def init_op_type(self):
-        self.op_type = "pool2d"
+    def init_kernel_type(self):
+        pass
 
     def init_pool_type(self):
         self.pool_type = "avg"
@@ -149,9 +153,6 @@ class TestCase1(TestPool2d_Op):
         self.strides = [1, 1]
         self.paddings = [0, 0]
 
-    def init_op_type(self):
-        self.op_type = "pool2d"
-
     def init_pool_type(self):
         self.pool_type = "avg"
         self.pool2D_forward_naive = avg_pool2D_forward_naive
@@ -167,9 +168,6 @@ class TestCase2(TestPool2d_Op):
         self.strides = [1, 1]
         self.paddings = [1, 1]
 
-    def init_op_type(self):
-        self.op_type = "pool2d"
-
     def init_pool_type(self):
         self.pool_type = "avg"
         self.pool2D_forward_naive = avg_pool2D_forward_naive
@@ -179,27 +177,18 @@ class TestCase2(TestPool2d_Op):
 
 
 class TestCase3(TestPool2d_Op):
-    def init_op_type(self):
-        self.op_type = "pool2d"
-
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase4(TestCase1):
-    def init_op_type(self):
-        self.op_type = "pool2d"
-
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase5(TestCase2):
-    def init_op_type(self):
-        self.op_type = "pool2d"
-
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
@@ -207,39 +196,105 @@ class TestCase5(TestCase2):
 
 #--------------------test pool2d--------------------
 class TestCUDNNCase1(TestPool2d_Op):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool2d"
+
+
+class TestFP16CUDNNCase1(TestPool2d_Op):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCUDNNCase2(TestCase1):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool2d"
+
+
+class TestFP16CUDNNCase2(TestCase1):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCUDNNCase3(TestCase2):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool2d"
+
+
+class TestFP16CUDNNCase3(TestCase2):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCUDNNCase4(TestCase3):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool2d"
+
+
+class TestFP16CUDNNCase4(TestCase3):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCUDNNCase5(TestCase4):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool2d"
+
+
+class TestFP16CUDNNCase5(TestCase4):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCUDNNCase6(TestCase5):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool2d"
+
+
+class TestFP16CUDNNCase6(TestCase5):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCeilModeCase1(TestCUDNNCase1):
@@ -264,39 +319,33 @@ class TestCeilModeCase4(TestCase2):
 
 #--------------------test pool2d MKLDNN--------------------
 class TestMKLDNNCase1(TestPool2d_Op):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_mkldnn = True
-        self.op_type = "pool2d"
 
 
 class TestMKLDNNCase2(TestCase1):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_mkldnn = True
-        self.op_type = "pool2d"
 
 
 class TestMKLDNNCase3(TestCase2):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_mkldnn = True
-        self.op_type = "pool2d"
 
 
 class TestMKLDNNCase4(TestCase3):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_mkldnn = True
-        self.op_type = "pool2d"
 
 
 class TestMKLDNNCase5(TestCase4):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_mkldnn = True
-        self.op_type = "pool2d"
 
 
 class TestMKLDNNCase6(TestCase5):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_mkldnn = True
-        self.op_type = "pool2d"
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
index 957fa5d2c4a795cfd01047c1b7845674e4c1d549..7feb509c4d6f5768552fc2515081f7e68f420967 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
@@ -27,12 +27,36 @@ class TestSequenceExpand(OpTest):
     def compute(self):
         x = self.inputs['X']
         x_data, x_lod = x if type(x) == tuple else (x, None)
-        n = 1 + x_data.shape[0] if not x_lod else len(x_lod[0])
         y_data, y_lod = self.inputs['Y']
-        repeats = [((y_lod[-1][i + 1] - y_lod[-1][i]))
-                   for i in range(len(y_lod[-1]) - 1)]
-        out = x_data.repeat(repeats, axis=0)
-        self.outputs = {'Out': out}
+
+        if hasattr(self, 'attrs'):
+            ref_level = self.attrs['ref_level']
+        else:
+            ref_level = len(y_lod) - 1
+
+        out = np.zeros(shape=((0, ) + x_data.shape[1:]), dtype=x_data.dtype)
+
+        if x_lod is None:
+            x_idx = [i for i in xrange(x_data.shape[0] + 1)]
+        else:
+            x_idx = x_lod[0]
+            out_lod = [[0]]
+
+        for i in xrange(1, len(y_lod[ref_level])):
+            repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]
+            x_len = x_idx[i] - x_idx[i - 1]
+            if repeat_num > 0:
+                x_sub = x_data[x_idx[i - 1]:x_idx[i], :]
+                x_sub = np.repeat(x_sub, repeat_num, axis=0)
+                out = np.vstack((out, x_sub))
+                if x_lod is not None:
+                    for j in xrange(repeat_num):
+                        out_lod[0].append(out_lod[0][-1] + x_len)
+
+        if x_lod is None:
+            self.outputs = {'Out': out}
+        else:
+            self.outputs = {'Out': (out, out_lod)}
 
     def setUp(self):
         self.op_type = 'sequence_expand'
@@ -52,7 +76,8 @@ class TestSequenceExpandCase1(TestSequenceExpand):
         x_lod = [[0, 2, 5]]
         y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32')
         y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]]
-        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
+        self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
+        self.attrs = {'ref_level': 0}
 
 
 class TestSequenceExpandCase2(TestSequenceExpand):
@@ -60,8 +85,9 @@ class TestSequenceExpandCase2(TestSequenceExpand):
         x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
         x_lod = [[0, 1]]
         y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32')
-        y_lod = [[0, 2]]
+        y_lod = [[0, 2], [0, 2]]
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
+        self.attrs = {'ref_level': 0}
 
 
 class TestSequenceExpandCase3(TestSequenceExpand):
@@ -75,14 +101,9 @@ class TestSequenceExpandCase3(TestSequenceExpand):
 
 class TestSequenceExpandCase4(TestSequenceExpand):
     def set_data(self):
-        x_data = np.array(
-            [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]).reshape(
-                [2, 5]).astype('float32')
-        x_lod = [[
-            0,
-            1,
-            2,
-        ]]
+        data = [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]
+        x_data = np.array(data).reshape([5, 2]).astype('float32')
+        x_lod = [[0, 2, 5]]
         y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32')
         y_lod = [[0, 1, 2], [0, 1, 2]]
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
index 4f20da2b926823db9e7ec92c95178b6d3d1feec9..33d60c7e31ce0817ad26ea1c1c974339936052d3 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -29,15 +29,20 @@ class TestSoftmaxOp(OpTest):
     def setUp(self):
         self.op_type = "softmax"
         self.use_cudnn = False
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [10, 10]).astype("float32")
-        }
-        self.outputs = {
-            'Out': np.apply_along_axis(stable_softmax, 1, self.inputs['X'])
+        self.use_mkldnn = False
+        self.dtype = np.float32
+        self.init_kernel_type()
+
+        x = np.random.uniform(0.1, 1, [10, 10]).astype(self.dtype)
+        out = np.apply_along_axis(stable_softmax, 1, x)
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+        self.attrs = {
+            'use_cudnn': self.use_cudnn,
+            'use_mkldnn': self.use_mkldnn
         }
-        self.attrs = {'use_cudnn': self.use_cudnn, }
 
-    def init_op_type(self):
+    def init_kernel_type(self):
         pass
 
     def test_check_output(self):
@@ -48,6 +53,8 @@ class TestSoftmaxOp(OpTest):
             self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         if self.use_cudnn:
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
@@ -57,8 +64,25 @@ class TestSoftmaxOp(OpTest):
 
 
 class TestSoftmaxCUDNNOp(TestSoftmaxOp):
-    def init_op_type(self):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+
+
+class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
+    def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
+
+class TestSoftmaxMKLDNNOp(TestSoftmaxOp):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py
index a369783245ae2e35a9743ef1f4321ac919e58283..379081c3287ce81dbf2bd7307cb5eac2620b13db 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor.py
@@ -126,7 +126,6 @@ class TestTensor(unittest.TestCase):
     def test_lod_tensor_gpu_init(self):
         if not core.is_compiled_with_cuda():
             return
-        scope = core.Scope()
         place = core.CUDAPlace(0)
         lod_py = [[0, 2, 5], [0, 2, 4, 5]]
         lod_tensor = core.LoDTensor()
@@ -144,6 +143,25 @@ class TestTensor(unittest.TestCase):
         self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
         self.assertListEqual(lod_py, lod_tensor.lod())
 
+    def test_empty_tensor(self):
+        place = core.CPUPlace()
+        scope = core.Scope()
+        var = scope.var("test_tensor")
+
+        tensor = var.get_tensor()
+
+        tensor.set_dims([0, 1])
+        tensor.alloc_float(place)
+
+        tensor_array = numpy.array(tensor)
+        self.assertEqual((0, 1), tensor_array.shape)
+
+        if core.is_compiled_with_cuda():
+            gpu_place = core.CUDAPlace(0)
+            tensor.alloc_float(gpu_place)
+            tensor_array = numpy.array(tensor)
+            self.assertEqual((0, 1), tensor_array.shape)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index eac2cb316835fda0a52ac9895eaa80914d0f1e5b..3684d1e8f73a21d9c6f2a5985f8b40ed6984057b 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -2747,17 +2747,17 @@ def img_pool_layer(input,
 
     ..  math::
 
-        w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride}
+        w & = 1 + ceil(\\frac{input\_width + 2 * padding - pool\_size}{stride})
 
-        h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}
+        h & = 1 + ceil(\\frac{input\_height + 2 * padding\_y - pool\_size\_y}{stride\_y})
 
     - ceil_mode=False:
 
     ..  math::
 
-        w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride}
+        w & = 1 + floor(\\frac{input\_width + 2 * padding - pool\_size}{stride})
 
-        h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}
+        h & = 1 + floor(\\frac{input\_height + 2 * padding\_y - pool\_size\_y}{stride\_y})
 
     The example usage is:
 
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
index 421f6c933d7032e4103f504fc509e2d5c89149b2..fda5246d74f598200b439774a25e80ec3e504077 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could
 be used in user program.
 """
 
-__all__ = ['np_array', 'text_file', "cloud_reader"]
+__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader']
 
 
 def np_array(x):