Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into cpp_parallel_executor

50e7e25d · Yu Yang · 5c7a5233 · 4f522fa8 · 50e7e25d · 50e7e25d
173 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,7 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
+option(WITH_AMD_GPU     "Compile PaddlePaddle with AMD GPU"             OFF)
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
@@ -181,6 +182,11 @@ if(WITH_GPU)
    include(cuda)
 endif(WITH_GPU)
+if(WITH_AMD_GPU)
+    find_package(HIP)
+    include(hip)
+endif(WITH_AMD_GPU)
 if(WITH_MKLML)
    list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
 endif()

--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -18,12 +18,13 @@ import sys
 import time
 import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
+import paddle.fluid as fluid
-import paddle.v2.fluid.core as core
+import paddle.fluid.core as core
-import paddle.v2.fluid.profiler as profiler
+import paddle.fluid.profiler as profiler
 import argparse
 import functools
 import os
+from paddle.fluid import debuger
 def str2bool(v):
@@ -182,28 +183,27 @@ def main():
            start_time = time.time()
            num_samples = 0
            train_pass_acc.reset()
-            with profiler.profiler("CPU", 'total') as prof:
+            for batch_id, data in enumerate(train_reader()):
-                for batch_id, data in enumerate(train_reader()):
+                ts = time.time()
-                    ts = time.time()
+                img_data = np.array(
-                    img_data = np.array(
+                    map(lambda x: x[0].reshape(data_shape), data)).astype(
-                        map(lambda x: x[0].reshape(data_shape), data)).astype(
+                        "float32")
-                            "float32")
+                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-                    y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                y_data = y_data.reshape([-1, 1])
-                    y_data = y_data.reshape([-1, 1])
+                loss, acc, b_size = exe.run(
-                    loss, acc, b_size = exe.run(
+                    trainer_prog,
-                        trainer_prog,
+                    feed={"pixel": img_data,
-                        feed={"pixel": img_data,
+                          "label": y_data},
-                              "label": y_data},
+                    fetch_list=[avg_cost, batch_acc, batch_size])
-                        fetch_list=[avg_cost, batch_acc, batch_size])
+                iters += 1
-                    iters += 1
+                num_samples += len(data)
-                    num_samples += len(data)
+                train_pass_acc.add(value=acc, weight=b_size)
-                    train_pass_acc.add(value=acc, weight=b_size)
+                print(
-                    print(
+                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
-                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
+                    % (pass_id, iters, loss, acc,
-                        % (pass_id, iters, loss, acc,
+                       len(data) / (time.time() - ts))
-                           len(data) / (time.time() - ts))
+                )  # The accuracy is the accumulation of batches, but not the current batch.
-                    )  # The accuracy is the accumulation of batches, but not the current batch.
            pass_elapsed = time.time() - start_time
            pass_train_acc = train_pass_acc.eval()
@@ -254,9 +254,7 @@ def main():
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
-            print("starting server side startup")
            exe.run(pserver_startup)
-            print("starting parameter server...")
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            # Parameter initialization

--- a/benchmark/cluster/vgg16/vgg16_tf.py
+++ b/benchmark/cluster/vgg16/vgg16_tf.py
@@ -292,14 +292,18 @@ def run_benchmark(cluster_spec, server):
        return np.mean(test_accs)
    config = tf.ConfigProto(
-        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+        intra_op_parallelism_threads=1,
+        inter_op_parallelism_threads=1,
+        log_device_placement=True)
    config.gpu_options.allow_growth = True
    hooks = [tf.train.StopAtStepHook(last_step=1000000)]
    with tf.train.MonitoredTrainingSession(
-            master=server.target, is_chief=(args.task_index == 0),
+            master=server.target,
-            hooks=hooks) as sess:
+            is_chief=(args.task_index == 0),
+            hooks=hooks,
+            config=config) as sess:
        iters, num_samples, start_time = 0, 0, 0.0
        for pass_id in range(args.num_passes):
            # train

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -57,11 +57,7 @@ if(NOT WITH_GOLANG)
    add_definitions(-DPADDLE_WITHOUT_GOLANG)
 endif(NOT WITH_GOLANG)
-if(NOT WITH_GPU)
+if(WITH_GPU)
-    add_definitions(-DHPPL_STUB_FUNC)
-    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
-else()
    add_definitions(-DPADDLE_WITH_CUDA)
    FIND_PACKAGE(CUDA REQUIRED)
@@ -84,7 +80,14 @@ else()
    # Include cuda and cudnn
    include_directories(${CUDNN_INCLUDE_DIR})
    include_directories(${CUDA_TOOLKIT_INCLUDE})
-endif(NOT WITH_GPU)
+elseif(WITH_AMD_GPU)
+    add_definitions(-DPADDLE_WITH_HIP)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__")
+else()
+    add_definitions(-DHPPL_STUB_FUNC)
+    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
+endif()
 if (WITH_MKLML AND MKLML_IOMP_LIB)
    message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")

--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -24,7 +24,7 @@ set(BOOST_PROJECT       "extern_boost")
 # So we use 1.41.0 here.
 set(BOOST_VER           "1.41.0")
 set(BOOST_TAR           "boost_1_41_0")
-set(BOOST_URL           "http://paddlepaddledeps.s3-website-us-west-1.amazonaws.com/${BOOST_TAR}.tar.gz")
+set(BOOST_URL           "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz")
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)

--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -4,18 +4,33 @@ SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
 SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
 INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
-ExternalProject_Add(
+if(WITH_AMD_GPU)
-    extern_eigen3
+    ExternalProject_Add(
-    ${EXTERNAL_PROJECT_LOG_ARGS}
+        extern_eigen3
-    GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
+        ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_TAG         70661066beef694cadf6c304d0d07e0758825c10
+        GIT_REPOSITORY  "https://github.com/sabreshao/hipeigen.git"
-    PREFIX          ${EIGEN_SOURCE_DIR}
+        GIT_TAG         0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
-    UPDATE_COMMAND  ""
+        PREFIX          ${EIGEN_SOURCE_DIR}
-    CONFIGURE_COMMAND ""
+        UPDATE_COMMAND  ""
-    BUILD_COMMAND     ""
+        CONFIGURE_COMMAND ""
-    INSTALL_COMMAND   ""
+        BUILD_COMMAND     ""
-    TEST_COMMAND      ""
+        INSTALL_COMMAND   ""
-)
+        TEST_COMMAND      ""
+    )
+else()
+    ExternalProject_Add(
+        extern_eigen3
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
+        GIT_TAG         70661066beef694cadf6c304d0d07e0758825c10
+        PREFIX          ${EIGEN_SOURCE_DIR}
+        UPDATE_COMMAND  ""
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND     ""
+        INSTALL_COMMAND   ""
+        TEST_COMMAND      ""
+    )
+endif()
 if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -317,6 +317,82 @@ function(nv_test TARGET_NAME)
  endif()
 endfunction(nv_test)
+# hip_library(<name> [STATIC|SHARED] SRCS <files...> DEPS <targets...>)
+#
+# Defines a library target whose sources are prepared through
+# HIP_PREPARE_TARGET_COMMANDS. The whole function is a no-op unless
+# WITH_AMD_GPU is enabled.
+#   - SRCS given:  builds a SHARED library when SHARED/shared is passed,
+#                  otherwise a STATIC one; DEPS (if any) are added as build
+#                  dependencies and linked into the target.
+#   - SRCS empty:  merges the static libraries listed in DEPS into <name>.
+#   - Neither:     configuration stops with a fatal error.
+#
+# NOTE(review): _hipcc_options, _hcc_options, _nvcc_options and _cmake_options
+# are not set inside this function; they expand to whatever the calling scope
+# defines (empty otherwise) -- confirm this is intended.
+function(hip_library TARGET_NAME)
+  if (WITH_AMD_GPU)
+    set(options STATIC static SHARED shared)
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(_sources ${hip_library_SRCS})
+    HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
+    # Sources handled by the HIP preparation step are represented by
+    # _generated_files, so drop them from the plain source list.
+    if(_source_files)
+      list(REMOVE_ITEM _sources ${_source_files})
+    endif()
+    if(hip_library_SRCS)
+      if (hip_library_SHARED OR hip_library_shared) # build *.so
+        add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources})
+        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
+      else()
+        add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
+        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+        target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
+        find_fluid_modules(${TARGET_NAME})
+      endif()
+      if (hip_library_DEPS)
+        add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
+        target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+      endif()
+      # cpplint code style: also check the header that sits next to each source.
+      foreach(source_file ${hip_library_SRCS})
+        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        endif()
+      endforeach()
+      add_style_check_target(${TARGET_NAME} ${hip_library_SRCS} ${hip_library_HEADERS})
+    else(hip_library_SRCS)
+      if (hip_library_DEPS)
+        merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
+      else()
+        # FATAL_ERROR (not FATAL) is the message() mode that aborts configuration.
+        message(FATAL_ERROR "Please specify source file or library in hip_library.")
+      endif()
+    endif(hip_library_SRCS)
+  endif()
+endfunction(hip_library)
+# hip_binary(<name> SRCS <files...> DEPS <targets...>)
+#
+# Builds an executable from SRCS via hip_add_executable. When DEPS is given,
+# the listed targets are linked into the executable and added as build
+# dependencies. The function is a no-op unless WITH_AMD_GPU is enabled.
+function(hip_binary TARGET_NAME)
+  if (WITH_AMD_GPU)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS})
+    if(hip_binary_DEPS)
+      target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
+      add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
+    endif()
+  endif()
+endfunction(hip_binary)
+# hip_test(<name> SRCS <files...> DEPS <targets...>)
+#
+# Builds a test executable from SRCS (prepared through
+# HIP_PREPARE_TARGET_COMMANDS), links it against DEPS plus paddle_gtest_main,
+# paddle_memory, gtest and gflags, and registers it with add_test under the
+# target's own name. The function is a no-op unless both WITH_AMD_GPU and
+# WITH_TESTING are enabled.
+#
+# NOTE(review): _hipcc_options, _hcc_options, _nvcc_options and _cmake_options
+# are not set inside this function; they come from the calling scope (empty
+# otherwise) -- confirm this is intended.
+function(hip_test TARGET_NAME)
+  if (WITH_AMD_GPU AND WITH_TESTING)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(_sources ${hip_test_SRCS})
+    HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
+    # Sources handled by the HIP preparation step are represented by
+    # _generated_files, so drop them from the plain source list.
+    if(_source_files)
+      list(REMOVE_ITEM _sources ${_source_files})
+    endif()
+    add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
+    set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
+    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_test(${TARGET_NAME} ${TARGET_NAME})
+  endif()
+endfunction(hip_test)
 function(go_library TARGET_NAME)
  set(options STATIC static SHARED shared)
  set(oneValueArgs "")

--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
+if(NOT WITH_AMD_GPU)
+    return()
+endif()
+include_directories("/opt/rocm/include")
+include_directories("/opt/rocm/hipblas/include")
+include_directories("/opt/rocm/hiprand/include")
+include_directories("/opt/rocm/rocrand/include")
+include_directories("/opt/rocm/rccl/include")
+include_directories("/opt/rocm/thrust")
+list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" )
+if(WITH_DSO)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
+endif(WITH_DSO)
+if(WITH_DOUBLE)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE")
+endif(WITH_DOUBLE)
+if(WITH_TESTING)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
+endif(WITH_TESTING)
+if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
+    list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
+    list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
+    list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
+endif()
+if("x${HCC_HOME}" STREQUAL "x")
+  set(HCC_HOME "/opt/rocm/hcc")
+endif()
+set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
+set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
+set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
 add_subdirectory(v2)
+add_subdirectory(fluid)
--- a/doc/fluid/CMakeLists.txt
+++ b/doc/fluid/CMakeLists.txt
+if(NOT DEFINED SPHINX_THEME)
+    set(SPHINX_THEME default)
+endif()
+if(NOT DEFINED SPHINX_THEME_DIR)
+    set(SPHINX_THEME_DIR)
+endif()
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+# HTML output directory
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)
+sphinx_add_target(paddle_fluid_docs
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
+# HTML output directory
+set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
+    "${BINARY_BUILD_DIR_CN}/conf.py"
+    @ONLY)
+sphinx_add_target(paddle_fluid_docs_cn
+                  html
+                  ${BINARY_BUILD_DIR_CN}
+                  ${SPHINX_CACHE_DIR_CN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_CN})
--- a/doc/fluid/build_and_install/index_cn.rst
+++ b/doc/fluid/build_and_install/index_cn.rst
+安装与使用
+------------
--- a/doc/fluid/build_and_install/index_en.rst
+++ b/doc/fluid/build_and_install/index_en.rst
+Build and Install
+-----------------
--- a/doc/design/images/multiple_reader.png
+++ b/doc/design/images/multiple_reader.png
--- a/doc/design/images/readers.png
+++ b/doc/design/images/readers.png
--- a/doc/fluid/design/concurrent/images/select_op_workflow.png
+++ b/doc/fluid/design/concurrent/images/select_op_workflow.png
--- a/doc/fluid/design/concurrent/select_op.md
+++ b/doc/fluid/design/concurrent/select_op.md
+# select_op Design
+## Introduction
+In golang, the [**select**](https://golang.org/ref/spec#Select_statements) 
+statement lets a goroutine wait on multiple communication operations at the 
+same time. The **select** blocks until one of its cases can run, then 
+executes the case.  If multiple cases are ready to run, then one case is 
+chosen at random to be executed.
+With the introduction of CSP for Paddle, we mimic this behavior by 
+creating a ***select_op***.
+## How to use it
+The **select_op** is available as a C++ operator.  However, most users
+will prefer to use the much simpler Python API.
+- **fluid.Select()**: Creates a select operator and adds it to the current
+block within the main program.  Also creates a sub block and adds it to the 
+main program.  This sub block is used to hold all variables and operators 
+used by the case statements.
+Within the select block, users can add cases by 
+calling **select.case** or **select.default** method.
+- **fluid.Select.case(channel_action, channel, result_variable)**: Represents
+a fluid channel send/recv case.  This method creates a SelectCase block
+guard and adds it to the Select block.  The arguments to this method tell
+the select which channel operation to listen to.
+- **fluid.Select.default()**: Represents the fluid default case.  This default
+case is executed if none of the channel send/recv cases are available to
+execute.
+**Example:**
+```
+ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+x = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
+y = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
+while_cond = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
+while_op = While(cond=while_cond)    
+with while_op.block():
+    with fluid.Select() as select:
+        with select.case(fluid.channel_send, ch1, x):
+            # Send x, then perform Fibonacci calculation on x and y
+            x_tmp = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
+            assign(input=x, output=x_tmp)
+            assign(input=y, output=x)
+            assign(elementwise_add(x=x_tmp, y=y), output=y)
+        with select.case(fluid.channel_recv, quit_ch, result2):
+            # Exit out of While loop
+            while_false = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False)
+            helper = layer_helper.LayerHelper('assign')
+            helper.append_op(
+                type='assign',
+                inputs={'X': [while_false]},
+                outputs={'Out': [while_cond]})
+```
+## How it Works
+### Program Description
+```
+blocks {
+  idx: 0
+  ...
+  // Create "case_to_execute" variable
+  ops {
+    outputs {
+      parameter: "Out"
+      arguments: "fill_constant_110.tmp_0"
+    }
+    type: "fill_constant"
+    attrs {
+      name: "force_cpu"
+      type: BOOLEAN
+      b: false
+    }
+    attrs {
+      name: "value"
+      type: FLOAT
+      f: -1.0
+    }
+    attrs {
+      name: "shape"
+      type: INTS
+      ints: 1
+    }
+    attrs {
+      name: "dtype"
+      type: INT
+      i: 2
+    }
+  }
+  // Create "select" operator.
+  // inputs: 
+  //   X: All input variables used by operators within the select block
+  //   case_to_execute: Variable filled in by select_op when it determines
+  //     which case to execute.
+  //  
+  // outputs:
+  //   Out: All output variables referenced by operators within select block. 
+  // 
+  // attrs:
+  //   sub_block: The block id containing the select "cases"
+  //   cases:  Serialized list of all cases in the select op. 
+  //     Each case is serialized as: '<index>,<type>,<channel>,<value>'
+  //     where type is 0 for default, 1 for send, and 2 for receive.
+  //     No channel and values are needed for default cases.
+  ops {
+    inputs {
+      parameter: "X"
+      arguments: "fill_constant_103.tmp_0"
+      arguments: "fill_constant_104.tmp_0"
+    }
+    inputs {
+      parameter: "case_to_execute"
+      arguments: "fill_constant_110.tmp_0"
+    }
+    outputs {
+      parameter: "Out"
+      arguments: "fill_constant_110.tmp_0"
+    }    
+    type: "select"
+    attrs {
+      name: "sub_block"
+      type: BLOCK
+      block_idx: 1
+    }
+    attrs {
+      name: "cases"
+      type: STRINGS
+      strings: "0,1,channel_101,fill_constant_109.tmp_0"
+      strings: "1,2,channel_102,fill_constant_108.tmp_0"
+    }
+  }
+  ...
+}
+```
+The python select API will add the **select_op** to the current block.  In addition, it will
+iterate through all its case statements and add any input variables required by case statements
+into **X**.  It will also create a temp variable called **case_to_execute**.  This variable is
+filled in by the select_op after it has completed processing the case statements.
+If there are no available cases to execute (i.e., all cases are blocked on channel operations, and
+there is no default statement), then the select_op will block the current thread.  The thread will 
+unblock once there is a channel operation affecting one of the case statements, at which point, the
+**select_op** will set the **case_to_execute** variable to the index of the case to execute.
+Finally the select_op will call executor.run on the **sub_block**.
+```
+blocks {
+  idx: 1
+  parent_idx: 0
+  ...
+  // Fill a tensor with the case index (i.e., 0, 1, 2, 3, etc.)
+  ops {
+    outputs {
+      parameter: "Out"
+      arguments: "fill_constant_111.tmp_0"
+    }
+    type: "fill_constant"
+    attrs {
+      name: "force_cpu"
+      type: BOOLEAN
+      b: false
+    }
+    attrs {
+      name: "value"
+      type: FLOAT
+      f: 0.0
+    }
+    attrs {
+      name: "shape"
+      type: INTS
+      ints: 1
+    }
+    attrs {
+      name: "dtype"
+      type: INT
+      i: 2
+    }
+  }
+  // Create an "equal" operator to compare the case index with the "case_to_execute"
+  // tensor (which was filled in by the select op).
+  ops {
+    inputs {
+      parameter: "X"
+      arguments: "fill_constant_111.tmp_0"  // case 0
+    }
+    inputs {
+      parameter: "Y"
+      arguments: "fill_constant_110.tmp_0"  // case_to_execute
+    }
+    outputs {
+      parameter: "Out"
+      arguments: "equal_0.tmp_0"
+    }
+    type: "equal"
+    attrs {
+      name: "axis"
+      type: INT
+      i: -1
+    }
+  }
+  // Use the output of the "equal" operator as a condition for the "conditional_block".
+  // If the condition evaluates to true, then execute the "sub_block" (which represents
+  // the select case's body)
+  ops {
+    inputs {
+      parameter: "Params"
+    }
+    inputs {
+      parameter: "X"
+      arguments: "equal_0.tmp_0"
+    }
+    outputs {
+      parameter: "Out"
+    }
+    outputs {
+      parameter: "Scope"
+      arguments: "_generated_var_0"
+    }
+    type: "conditional_block"
+    attrs {
+      name: "is_scalar_condition"
+      type: BOOLEAN
+      b: true
+    }
+    attrs {
+      name: "sub_block"
+      type: BLOCK
+      block_idx: 4
+    }
+  }
+  ...
+  // Repeat the above operators for each case statement inside the select body
+}
+```
+Cases are represented by a **conditional_block operator**, whose condition is set as the output of
+equal(**case_to_execute**, **case_index**).  Since each case index is unique in this sub-block, 
+only one case will be executed.
+### select_op flow
+<p align="center">
+<img src="./images/select_op_workflow.png"/><br/>
+</p>
+The select algorithm is inspired by golang's select routine.  Please refer to 
+http://www.tapirgames.com/blog/golang-concurrent-select-implementation for more information.
+## Backward Pass
+TODO
--- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
+## Design Doc: Distributed Lookup Table Operator
+A lookup table operator in PaddlePaddle where the table could be out
+of the memory of a computer.
+## Background
+A lookup table operator is widely used in deep learning for learning the
+representation, or the
+[*embedding*](http://www.cs.toronto.edu/~fritz/absps/ieee-lre.pdf), of
+symbols.
+### The Forward Algorithm
+The forward algorithm of the lookup table is a multiplication of the
+input vector x and the lookup table matrix W:
+$$y = x * W$$
+When x is a sparse vector of symbols, the above multiplication
+simplifies into looking up rows in W that correspond to symbols in x,
+denoted by W(x).  Please be aware that W could be huge and out of the
+memory, so we'd need a distributed storage service, which supports the
+lookup of rows.
+The following figure illustrates the multiplication of x with two
+non-zero elements, or say, two symbols, and a lookup table W:
+![lookup table](./src/lookup_table.png)
+### The Backward Algorithm
+The backward algorithm computes W'(x) using W(x).  W'(x) has the same
+scale of size as W(x) and is much smaller than W.
+To optimize W given W', we can do simple SGD update:
+$$W = f(W') = \lambda * W'$$
+or some more sophisticated algorithms that rely on both W' and W:
+$$W = f(W, W')$$
+The following figure illustrates the backward pass of the lookup
+operator: ![lookup table training](./src/lookup_table_training.png)
+## Distributed Storage Service
+The forward algorithm requires a distributed storage service for W.
+The backward algorithm prefers that the storage system can apply the
+optimization algorithm on W.  The following two sections describe two
+solutions -- the former doesn't require that the storage service can
+do optimization, the latter does.
+### Storage Service Doesn't Optimize
+In this design, we use highly-optimized distributed storage, e.g.,
+memcached, as the storage service, and we run the optimization
+algorithm on parameter servers of PaddlePaddle.  The following figure
+illustrates the training process.
+<!--
+Note: please update the following URL when updating this digraph.
+<img src='https://g.gravizo.com/svg?
+digraph G {
+  rankdir="LR";
+  subgraph cluster1 {
+  P1 [label="pserver 1"];
+  P2 [label="pserver 2"];
+  T1 [label="trainer 1"];
+  T2 [label="trainer 2"];
+  T3 [label="trainer 3"];
+  }
+  KV [label="memcached"];
+  T1 -> P1;
+  T1 -> P2;
+  T2 -> P1;
+  T2 -> P2;
+  T3 -> P1;
+  T3 -> P2;
+  P1 -> KV [color=gray, weight=0.1];
+  KV -> P1 [color=gray, weight=0.1];
+  P2 -> KV [color=gray, weight=0.1];
+  KV -> P2 [color=gray, weight=0.1];
+  KV -> T1 [color=gray, weight=0.1];
+  KV -> T2 [color=gray, weight=0.1];
+  KV -> T3 [color=gray, weight=0.1];
+}
+)
+'/>
+-->
+<img src='https://g.gravizo.com/svg?%20digraph%20G%20{%20rankdir=%22LR%22;%20subgraph%20cluster1%20{%20P1%20[label=%22pserver%201%22];%20P2%20[label=%22pserver%202%22];%20T1%20[label=%22trainer%201%22];%20T2%20[label=%22trainer%202%22];%20T3%20[label=%22trainer%203%22];%20}%20KV%20[label=%22memcached%22];%20T1%20-%3E%20P1;%20T1%20-%3E%20P2;%20T2%20-%3E%20P1;%20T2%20-%3E%20P2;%20T3%20-%3E%20P1;%20T3%20-%3E%20P2;%20P1%20-%3E%20KV%20[color=gray,%20weight=0.1];%20KV%20-%3E%20P1%20[color=gray,%20weight=0.1];%20P2%20-%3E%20KV%20[color=gray,%20weight=0.1];%20KV%20-%3E%20P2%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T1%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T2%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T3%20[color=gray,%20weight=0.1];%20}'/>
+Each trainer runs the forward and backward passes using their local
+data:
+1. In the forward pass, when a trainer runs the forward algorithm of a
+   lookup operator, it retrieves W(x) from the storage service.
+1. The trainer computes W'(x) in the backward pass using W(x).
+During the global update process:
+1. Each trainer uploads its W'(x) to parameter servers.
+1. The parameter server runs the optimization algorithm, e.g., the
+   Adam optimization algorithm, which requires that
+   1. The parameter server retrieves W(x) from memcached, and
+   1. The parameter server pushes $\Delta W(x)=f(W(x), \lambda \sum_j
+      W'(x))$ to memcached, where $f$ denotes the optimization
+      algorithm.
+### Storage Service Does Optimize
+This design is very similar to the above one, except that the
+optimization algorithm $f$ runs on the storage service.
+- Pro: parameter servers do not retrieve W(x) from the storage
+  service, thus saving half of the network communication.
+- Con: the storage service needs to be able to run the optimization
+  algorithm.
+## Conclusion
+Let us do the "storage service does not optimize" solution first, as a
+baseline at least, because it is easier to use a well-optimized
+distributed storage service like memcached.  We can do the "storage
+service does optimize" solution later or at the same time, which, if
+implemented carefully, should have better performance than the former.
--- a/doc/fluid/design/dist_train/src/lookup_table.png
+++ b/doc/fluid/design/dist_train/src/lookup_table.png
--- a/doc/fluid/design/dist_train/src/lookup_table_training.png
+++ b/doc/fluid/design/dist_train/src/lookup_table_training.png
--- a/doc/fluid/design/dynamic_rnn/rnn.md
+++ b/doc/fluid/design/dynamic_rnn/rnn.md
@@ -5,7 +5,7 @@ This document describes the RNN (Recurrent Neural Network) operator and how it i
 ## RNN Algorithm Implementation
 <p align="center">
-<img src="./images/rnn.jpg"/>
+<img src="./rnn.jpg"/>
 </p>
 The above diagram shows an RNN unrolled into a full network.
@@ -22,7 +22,7 @@ There are several important concepts here:
 There could be local variables defined in each step-net.  PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
 <p align="center">
-<img src="./images/rnn.png"/><br/>
+<img src="./rnn.png"/><br/>
 Figure 2 illustrates the RNN's data flow
 </p>
@@ -49,7 +49,7 @@ or copy the memory value of the previous step to the current ex-memory variable.
 ### Usage in Python
-For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md).
 We can define an RNN's step-net using a Block:
@@ -93,7 +93,7 @@ For example, we could have a 2-level RNN, where the top level corresponds to par
 The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text.
 <p align="center">
-<img src="./images/2_level_rnn.png"/>
+<img src="./2_level_rnn.png"/>
 </p>
 ```python
@@ -149,5 +149,5 @@ If the `output_all_steps` is set to False, it will only output the final time st
 <p align="center">
-<img src="images/rnn_2level_data.png"/>
+<img src="./rnn_2level_data.png"/>
 </p>
--- a/doc/fluid/design/index_cn.rst
+++ b/doc/fluid/design/index_cn.rst
+设计思想
+------------
--- a/doc/fluid/design/index_en.rst
+++ b/doc/fluid/design/index_en.rst
+Design
+------------
--- a/doc/fluid/design/motivation/fluid.md
+++ b/doc/fluid/design/motivation/fluid.md
@@ -103,7 +103,7 @@ In computability theory, a system of data-manipulation rules, such as a programm
 There are two ways to execute a Fluid program.  When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
-There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
+There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
 Fluid is moving towards the direction of a compiler, which is explain in [fluid_compiler.md](fluid_compiler.md).

--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
+开发标准
+------------
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
+Development
+------------
+This is Development page
--- a/doc/fluid/faq/index_cn.rst
+++ b/doc/fluid/faq/index_cn.rst
+FAQ
+------------
--- a/doc/fluid/faq/index_en.rst
+++ b/doc/fluid/faq/index_en.rst
+FAQ
+------------
--- a/doc/fluid/getstarted/index_cn.rst
+++ b/doc/fluid/getstarted/index_cn.rst
+新手入门
+------------
+新手入门
--- a/doc/fluid/getstarted/index_en.rst
+++ b/doc/fluid/getstarted/index_en.rst
+GET STARTED
+------------
+This is get started page
--- a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
+# Fluid 分布式版本使用指南
+本篇文章将说明如何在PaddlePaddle Fluid版本下进行分布式训练的配置和执行，以及将单机训练脚本改造成支持集群训练的版本
+## 准备工作
+* 可用的集群
+    包含一个或多个计算节点的集群，每一个节点都能够执行PaddlePaddle的训练任务且拥有唯一的IP地址，集群内的所有计算节点可以通过网络相互通信。
+* 安装PaddlePaddle Fluid with Distribution版本
+    所有的计算节点上均需要安装分布式版本的PaddlePaddle, 在用于GPU等设备的机器上还需要额外安装好相应的驱动程序和CUDA的库。
+    **注意：**当前对外提供的PaddlePaddle版本并不支持分布式，需要通过源码重新编译。编译和安装方法参见[编译和安装指南](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html)。
+    cmake编译命令中需要将WITH_DISTRIBUTE设置为ON，下面是一个cmake编译指令示例：
+``` bash
+cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
+```
+## 更新训练脚本
+这里，我们以[Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)课程中的第一章 fit a line 为例，描述如何将单机训练脚本改造成支持集群训练的版本。
+### 单机训练脚本示例
+```python
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+BATCH_SIZE = 20
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+exe.run(fluid.default_startup_program())
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+    fluid.io.save_persistables(exe, "./fit_a_line.model/")
+    fluid.io.load_persistables(exe, "./fit_a_line.model/")
+    for data in train_reader():
+        avg_loss_value, = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+        if avg_loss_value[0] < 10.0:
+            exit(0)  # if avg cost less than 10.0, we think our code is good.
+exit(1)
+```
+我们创建了一个简单的全连接神经网络程序，并且通过Fluid的Executor执行了100次迭代,现在我们需要将该单机版本的程序更新为分布式版本的程序。
+### 介绍Parameter Server
+在非分布式版本的训练脚本中，只存在Trainer一种角色，它不仅处理常规的计算任务，也处理参数相关的计算、保存和优化任务。在分布式版本的训练过程中，由于存在多个Trainer节点进行同样的数据计算任务，因此需要有一个中心化的节点来统一处理参数相关的保存和分配。在PaddlePaddle中，我们称这样的节点为[Parameter Server](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/dist_train/parameter_server.md)
+**因此，在分布式的Fluid环境中，我们有两个角色需要创建，分别是Parameter Server和Trainer。**
+### 分布式训练 
+Fluid专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数，将他们分隔为两部分，通过send/recv 操作算子进行连接,优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。
+```python
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) 
+```
+将Distributed Transpiler、优化算子和梯度参数放在一个代码中如下：
+```python
+... #define the program, cost, and create sgd optimizer
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters
+t = fluid.DistributeTranspiler() # create the transpiler instance
+# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+... #create executor
+# in pserver, run this
+#current_endpoint here means current pserver IP:PORT you wish to run on
+pserver_prog = t.get_pserver_program(current_endpoint)
+pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+exe.run(pserver_startup)
+exe.run(pserver_prog)
+# in trainer, run this
+... # define data reader
+exe.run(fluid.default_startup_program())
+for pass_id in range(100):
+    for data in train_reader():
+        exe.run(t.get_trainer_program())
+```
+### 分布式训练脚本运行说明
+分布式任务的运行需要将表格中说明的多个参数进行赋值:
+| 参数名 | 值类型 | 说明 | 示例 |
+|:-------------|:------|:---------------------------------------|:-------------|
+| trainer_id | int | 当前训练节点的ID，训练节点ID编号为0 - n-1， n为trainers的值 | 0/1/2/3 |
+| pservers | str | parameter server 列表 | 127.0.0.1:6710,127.0.0.1:6711 |
+| trainers | int | 训练节点的总个数，>0的数字 | 4 |
+| server_endpoint | str | 当前所起的服务节点的IP:PORT | 127.0.0.1:8789 |
+| training_role | str | 节点角色， TRAINER/PSERVER | PSERVER |
+**注意：** ```training_role```是用来区分当前所起服务的角色的，用于训练程序中，用户可根据需要自行定义，其他参数为fluid.DistributeTranspiler的transpile函数所需要，需要在调用函数前进行定义，样例如下： 
+```python
+t = fluid.DistributeTranspiler()
+t.transpile(
+    optimize_ops,
+    params_grads,
+    trainer_id,
+    pservers=pserver,
+    trainers=trainers)
+if training_role == "PSERVER":
+    pserver_prog = t.get_pserver_program(server_endpoint)
+    pserver_startup = t.get_startup_program(server_endpoint, pserver_prog)
+```
+### Demo
+完整的demo代码位于Fluid的test目录下的[book](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py)中。
+第一步，进入demo代码所在目录：
+```bash
+cd /paddle/python/paddle/fluid/tests/book
+```
+第二步，启动Parameter Server：
+```bash
+PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py
+```
+执行命令后请等待出现提示： ```Server listening on 192.168.1.2:6174 ```, 表示Parameter Server已经正常启动。
+第三步，启动Trainer：
+```bash
+PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py
+```
+由于我们定义的Trainer的数量是2个，因此需要在另外一个计算节点上再启动一个Trainer。
+现在我们就启动了一个包含一个Parameter Server和两个Trainer的分布式训练任务。
--- a/doc/fluid/howto/index_cn.rst
+++ b/doc/fluid/howto/index_cn.rst
+进阶使用
+------------
--- a/doc/fluid/howto/index_en.rst
+++ b/doc/fluid/howto/index_en.rst
+HOW TO
+------------
+This is how to page
--- a/doc/fluid/index_cn.rst
+++ b/doc/fluid/index_cn.rst
+PaddlePaddle Fluid
+==========================
+..  toctree::
+  :maxdepth: 1
+  getstarted/index_cn.rst
+  design/index_cn.rst
+  build_and_install/index_cn.rst
+  howto/index_cn.rst
+  dev/index_cn.rst
+  faq/index_cn.rst
--- a/doc/fluid/index_en.rst
+++ b/doc/fluid/index_en.rst
+PaddlePaddle Fluid
+==========================
+..  toctree::
+  :maxdepth: 1
+  getstarted/index_en.rst
+  design/index_en.rst
+  build_and_install/index_en.rst
+  howto/index_en.rst
+  dev/index_en.rst
+  faq/index_en.rst
--- a/doc/v2/api/fluid/optimizer.rst
+++ b/doc/v2/api/fluid/optimizer.rst
@@ -47,3 +47,10 @@ DecayedAdagrad
    :members:
    :noindex:
+Adadelta
+--------------
+..  autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer
+    :members:
+    :noindex:
--- a/doc/v2/dev/write_docs_cn.rst
+++ b/doc/v2/dev/write_docs_cn.rst
@@ -2,13 +2,14 @@
 如何贡献文档
 #############
-PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成，也可以利用paddlepaddle.org工具来编译和预览文档。
+PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成的，PaddlePaddle.org工具可以帮助我们实现这一编译过程，并提供更好的预览效果。
 如何构建文档
 ============
 PaddlePaddle的文档构建有两种方式，分别为使用paddlepaddle.org工具和不使用paddlepaddle.org工具，两种方式都有各自的优点，前者方便预览，后者方便开发者进行调试。这两种方式中又分别有使用docker和不使用docker的两种构建方法。
+我们建议使用PaddlePaddle.org工具来构建文档。
 使用PaddlePaddle.org工具
 ------------------------
@@ -31,7 +32,7 @@ PaddlePaddle.org工具可以配合Docker使用，需要在系统里先安装好D
    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
 注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令
-之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档
+之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档
 编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
 如果不想使用Docker，你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
@@ -56,7 +57,7 @@ PaddlePaddle.org工具可以配合Docker使用，需要在系统里先安装好D
    python manage.py runserver
 工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。
-之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。
+之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。
 编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
 想了解更多PaddlePaddle.org工具的详细信息，可以 `点击这里 <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ 。
@@ -96,7 +97,7 @@ PaddlePaddle.org工具可以配合Docker使用，需要在系统里先安装好D
   python -m SimpleHTTPServer 8088
-在浏览器中输入http://localhost:8088就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意，示例中由于使用了sphinx的原始主题，所以页面的风格与官网并不一致，但这并不影响开发者进行调试。
+在浏览器中输入 http://localhost:8088 就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意，示例中由于使用了sphinx的原始主题，所以页面的风格与官网并不一致，但这并不影响开发者进行调试。
 ..  image:: src/doc_en.png
    :align: center

--- a/doc/v2/dev/write_docs_en.rst
+++ b/doc/v2/dev/write_docs_en.rst
@@ -2,21 +2,20 @@
 Contribute Documentation
 ########################
-PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
+PaddlePaddle's documentation includes both Chinese and English versions. The documentation is built using the ``cmake`` command to drive the ``sphinx`` compiler. The PaddlePaddle.org tool helps us to implement this compilation process and provides better preview results.
-Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
-When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content
-How to Build Documentations
+How to build Documentation
-============
+===========================
-We recommend using PaddlePaddle.org tool to build documentation
+PaddlePaddle's documentation is built in two ways: using the PaddlePaddle.org tool and without using it. Both methods have their own advantages. The former facilitates previewing, while the latter facilitates debugging by the developer. We could choose to build the documentation with Docker or without it in each of the above ways.
+We recommend using PaddlePaddle.org tool to build documentation.
-Use PaddlePaddle.org tool
+Using PaddlePaddle.org tool
--------------
+-----------------------------
-This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser.
+This is the recommended method to build documentation, because it can automatically compile the documentation and preview the documentation directly in a web page. Note that, although you can preview the documentation in other ways, its style may not be consistent with the official website. Compiling with the PaddlePaddle.org tool produces a preview that will be consistent with the official website documentation style.
-The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool
+The PaddlePaddle.org tool can be used with Docker and Docker needs to be installed first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After installing Docker, you may use the following commands to activate the tool
 ..  code-block:: bash
@@ -32,8 +31,8 @@ The tool uses Docker, please install it on your system. Please check Docker offi
    # Please specify the working directory through -v
    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
-Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command
+Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command
-Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
+Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation.
 The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
@@ -58,19 +57,62 @@ If you don't wish to use Docker, you can also activate the tool through Django.
    pip install -r requirements.txt
    python manage.py runserver
-Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
+Specify the PaddlePaddle working directory for the environment variable CONTENT_DIR so that the tool could find where the working directory is.
+Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation.
 The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
-If you want to learn more on the PaddlePaddle.org, please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ 。
+Please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ for more information about the PaddlePaddle.org tool.
+Manually Building the Documentation
+-------------------------------------
+To build PaddlePaddle's documentation with Docker, you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After Docker is installed, you could use the scripts in the source directory to build the documentation.
+[TBD]
+If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation.
+.. code-block:: bash
+   mkdir paddle
+   cd paddle
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   mkdir -p build
+   cd build
+   cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
+   # If you only need to build documents, use the following commands
+   make -j $processors gen_proto_py
+   make -j $processors paddle_docs paddle_docs_cn
+   # If you only need to build APIs, use the following commands
+   make -j $processors gen_proto_py framework_py_proto
+   make -j $processors copy_paddle_pybind
+   make -j $processors paddle_api_docs
+$processors indicates that as many processes as the CPU cores are started to compile in parallel. It should be set according to the number of CPU cores of your machine.
+After the compilation is complete, enter the ``doc/v2`` directory. If you chose to build documents, it will generate ``cn/html/`` and ``en/html`` subdirectories under this directory. If you chose to build APIs, it will generate the ``api/en/html`` subdirectory. Please enter these directories respectively and execute the following commands:
+.. code-block:: bash
+   python -m SimpleHTTPServer 8088
+Use a web browser and navigate to http://localhost:8088 to see the compiled Chinese/English documents page and the English APIs page. The following figure is an example of the built English documents home page. Note that because the example uses sphinx's original theme, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
-How to write Documentations
+..  image:: src/doc_en.png
-============
+    :align: center
+    :scale: 60 %
-PaddlePaddle uses `sphinx`_ to compile documentations，Please check sphinx official website for more detail.
+How to write Documentation
+===========================
+PaddlePaddle uses `sphinx`_ to compile documentation. Please check the sphinx official website for more details.
 How to update www.paddlepaddle.org
-============================
+===================================
 Please create PRs and submit them to github, please check `Contribute Code <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ 。
 PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and

--- a/doc/v2/faq/index_en.rst
+++ b/doc/v2/faq/index_en.rst
 FAQ
 ====
+This document provides answers to some of the frequently asked questions about PaddlePaddle. If you have a question that is not covered here, please go to `PaddlePaddle Community <https://github.com/PaddlePaddle/Paddle/issues>`_ to find an answer or submit a new `issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_, and we will reply in time.
 ..  toctree::
  :maxdepth: 1

--- a/doc/v2/getstarted/index_en.rst
+++ b/doc/v2/getstarted/index_en.rst
 GET STARTED
 ============
+If you want to quickly know how to use PaddlePaddle, please refer to the following guide:
 ..  toctree::
  :maxdepth: 1
  quickstart_en.rst
+While using PaddlePaddle to build applications, please understand some basic concepts.
+Here is an example of linear regression. It introduces workflow of PaddlePaddle, including data format, model configuration and training, etc.
+..  toctree::
+  :maxdepth: 1
  concepts/use_concepts_en.rst
--- a/doc/v2/howto/capi/index_en.rst
+++ b/doc/v2/howto/capi/index_en.rst
-C-API Prediction Library
+C-API Inference Library
 ========================
+After we train a neural network, we use it to do inference. Inference is the process of preparing input data and propagating it through the model to produce the result.
+Compared with model training, prediction has the following features:
+#. Inference does not require backpropagation and parameter updates, as required during training.
+#. Labels are not needed in prediction.
+#. Most of the time, predictions need to be integrated with the user system.
+Therefore, the model prediction SDK needs to be designed separately and has the following features:
+#. The predictive SDK does not include backpropagation and parameter updates to reduce the size of the SDK.
+#. The predictive SDK needs a simple user interface for ease of use.
+#. Since the input data may have a variety of structures, the format of the input data is clearly and compactly packaged.
+#. In order to be compatible with user's system, the SDK's interface must conform to the C-standard interface.
+PaddlePaddle provides C-API to solve the above problem. Following are the guidelines to use the C-API:
 ..  toctree::
  :maxdepth: 1

--- a/doc/v2/howto/cluster/index_en.rst
+++ b/doc/v2/howto/cluster/index_en.rst
@@ -2,6 +2,9 @@ Distributed Training
 ====================
 The effectiveness of the deep learning model is often directly related to the scale of the data: it can generally achieve better results after increasing the size of the dataset on the same model. However, it can not fit in one single computer when the amount of data increases to a certain extent. At this point, using multiple computers for distributed training is a natural solution. In distributed training, the training data is divided into multiple copies (sharding), and multiple machines participating in the training read their own data for training and collaboratively update the parameters of the overall model.
+Distributed training generally has a framework as shown below:
 .. image:: src/ps_en.png
   :width: 500

--- a/doc/v2/howto/index_en.rst
+++ b/doc/v2/howto/index_en.rst
 HOW TO
-=======
+========
+PaddlePaddle provides the users the ability to flexibly set various command line parameters to control the model training and inference process. Please refer to the following instructions on using PaddlePaddle:
 ..  toctree::
  :maxdepth: 1
  cmd_parameter/index_en.rst
+PaddlePaddle supports distributed training tasks on fabric clusters, MPI clusters, and Kubernetes clusters. For detailed configuration and usage instructions, refer to:
+..  toctree::
+  :maxdepth: 1
  cluster/index_en.rst
+PaddlePaddle provides a C-API for inference. We provide the following guidelines for using the C-API:
+..  toctree::
+  :maxdepth: 1
  capi/index_en.rst
+PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For details, please refer to:
+..  toctree::
+  :maxdepth: 1
  rnn/index_en.rst
+For how to use the built-in timing tool, nvprof, or nvvp to run performance analysis and tuning, please refer to:
+..  toctree::
+  :maxdepth: 1
  optimization/gpu_profiling_en.rst
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -36,7 +36,7 @@ target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
 add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
  ${CAPI_PRIVATE_HEADER})
-add_dependencies(paddle_capi paddle_proto)
+add_dependencies(paddle_capi paddle_proto paddle_gserver)
 # TODO: paddle_capi_whole will be removed.
 set(PADDLE_CAPI_LAYERS_LIBS

--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
@@ -871,3 +871,67 @@ TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
  ch->Reset<int>(0);
  ChannelHolderDestroyUnblockSenders(ch, false);
 }
+// Tests closing a ChannelHolder many times: sender, receiver and closer
+// threads all operate on the same channel, and every one of them is expected
+// to finish. This function deletes `ch` before returning.
+void ChannelHolderManyTimesClose(ChannelHolder *ch) {
+  const int num_threads = 15;
+  std::thread t[num_threads];
+  // NOTE(review): these flags are plain bools written by the worker threads
+  // and read below without synchronization; the test appears to rely on the
+  // 100 ms sleep. Confirm whether std::atomic<bool> is wanted here.
+  bool thread_ended[num_threads];
+  // First third of the threads: each tries to send one value to the channel.
+  for (size_t i = 0; i < num_threads / 3; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended) {
+          int data = 10;
+          ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i]);
+  }
+  // Second third: each tries to receive one value from the channel. The
+  // value is only checked when Receive reports success.
+  for (size_t i = num_threads / 3; i < 2 * num_threads / 3; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          if (ch->Receive(&data)) {
+            EXPECT_EQ(data, 10);
+          }
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  // Last third: each closes the channel if it does not see it closed yet.
+  for (size_t i = 2 * num_threads / 3; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          if (!ch->IsClosed()) {
+            ch->close();
+          }
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // let threads run
+  // Verify that all threads are unblocked, i.e. each one set its flag.
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+  EXPECT_TRUE(ch->IsClosed());
+  // Delete the channel, then join every thread.
+  delete ch;
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) {
+  // Check for Buffered Channel (Reset<int>(10)). `ch` is deleted inside
+  // ChannelHolderManyTimesClose, so it is not freed here.
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(10);
+  ChannelHolderManyTimesClose(ch);
+}
--- a/paddle/fluid/framework/concurrency_test.cc
+++ b/paddle/fluid/framework/concurrency_test.cc
@@ -150,8 +150,9 @@ void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
  // Select block
  AddOp("select", {{"X", {dataChanName, quitChanName}},
                   {"case_to_execute", {"caseToExecute"}}},
-        {}, {{"sub_block", casesBlock},
+        {{"Out", {}}},
-             {"cases", std::vector<std::string>{case0Config, case1Config}}},
+        {{"sub_block", casesBlock},
+         {"cases", std::vector<std::string>{case0Config, case1Config}}},
        whileBlock);
  scope->Var("stepScopes");
@@ -209,9 +210,8 @@ TEST(Concurrency, Go_Op) {
  executor.Run(program, &scope, 0, true, true);
-  // After we call executor.run, the Go operator should do a channel_send to set
+  // After we call executor.run, the Go operator should do a channel_send to
-  // the
+  // set the "result" variable to 99.
-  // "result" variable to 99
  auto *finalData = tensor.data<int>();
  EXPECT_EQ(finalData[0], 99);
 }

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -14,12 +14,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
-#include <set>
-#include "gflags/gflags.h"
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -40,14 +36,13 @@ namespace {
 int kProgramId = -1;
 }  // namespace
-struct ExecutorPrepareContext {
+ExecutorPrepareContext::ExecutorPrepareContext(
-  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id)
+    const framework::ProgramDesc& prog, size_t block_id)
-      : prog_(prog), block_id_(block_id) {}
+    : prog_(prog), block_id_(block_id) {}
-  const framework::ProgramDesc& prog_;
+ExecutorPrepareContext::~ExecutorPrepareContext() {
-  size_t block_id_;
+  VLOG(5) << "destroy ExecutorPrepareContext";
-  std::vector<std::unique_ptr<OperatorBase>> ops_;
+}
-};
 Executor::Executor(const platform::Place& place) : place_(place) {}
@@ -101,9 +96,8 @@ static void CheckTensorNANOrInf(const std::string& name,
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                   bool create_local_scope, bool create_vars) {
  platform::RecordBlock b(block_id);
-  auto* ctx = Prepare(pdesc, block_id);
+  auto ctx = Prepare(pdesc, block_id);
-  RunPreparedContext(ctx, scope, create_local_scope, create_vars);
+  RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
-  delete ctx;
 }
 // Check whether the block already has feed operators and feed_holder.
@@ -113,10 +107,11 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
 // and feed_holder_name. Raise exception when any mismatch is found.
 // Return true if the block has feed operators and holder of matching info.
 static bool has_feed_operators(
-    BlockDesc* block, std::map<std::string, const LoDTensor*>& feed_targets,
+    const BlockDesc& block,
+    std::map<std::string, const LoDTensor*>& feed_targets,
    const std::string& feed_holder_name) {
  size_t feed_count = 0;
-  for (auto* op : block->AllOps()) {
+  for (auto* op : block.AllOps()) {
    if (op->Type() == kFeedOpType) {
      feed_count++;
      PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name,
@@ -135,7 +130,7 @@ static bool has_feed_operators(
        "The number of feed operators should match 'feed_targets'");
    // When feed operator are present, so should be feed_holder
-    auto var = block->FindVar(feed_holder_name);
+    auto var = block.FindVar(feed_holder_name);
    PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
                            feed_holder_name);
    PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH,
@@ -153,10 +148,10 @@ static bool has_feed_operators(
 // and fetch_holder_name. Raise exception when any mismatch is found.
 // Return true if the block has fetch operators and holder of matching info.
 static bool has_fetch_operators(
-    BlockDesc* block, std::map<std::string, LoDTensor*>& fetch_targets,
+    const BlockDesc& block, std::map<std::string, LoDTensor*>& fetch_targets,
    const std::string& fetch_holder_name) {
  size_t fetch_count = 0;
-  for (auto* op : block->AllOps()) {
+  for (auto* op : block.AllOps()) {
    if (op->Type() == kFetchOpType) {
      fetch_count++;
      PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name,
@@ -175,7 +170,7 @@ static bool has_fetch_operators(
        "The number of fetch operators should match 'fetch_targets'");
    // When fetch operator are present, so should be fetch_holder
-    auto var = block->FindVar(fetch_holder_name);
+    auto var = block.FindVar(fetch_holder_name);
    PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
                            fetch_holder_name);
    PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST,
@@ -190,12 +185,21 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
                   std::map<std::string, const LoDTensor*>& feed_targets,
                   std::map<std::string, LoDTensor*>& fetch_targets,
                   const std::string& feed_holder_name,
-                   const std::string& fetch_holder_name) {
+                   const std::string& fetch_holder_name, bool create_vars) {
  platform::RecordBlock b(kProgramId);
-  auto* copy_program = new ProgramDesc(program);
+  bool has_feed_ops =
+      has_feed_operators(program.Block(0), feed_targets, feed_holder_name);
+  bool has_fetch_ops =
+      has_fetch_operators(program.Block(0), fetch_targets, fetch_holder_name);
+  // By default operate on the caller's program; it is only read in that case
+  // because no feed/fetch operators have to be inserted.
+  ProgramDesc* copy_program = const_cast<ProgramDesc*>(&program);
+  // Owns the private copy, when one is needed, until this function returns.
+  // The owner must be a named local: a temporary unique_ptr would destroy the
+  // copy at the end of the statement and leave copy_program dangling.
+  std::unique_ptr<ProgramDesc> owned_copy_program;
+  if (!has_feed_ops || !has_fetch_ops) {
+    // Feed and/or fetch operators will be added below, so mutate a copy
+    // instead of the caller's const program.
+    owned_copy_program.reset(new ProgramDesc(program));
+    copy_program = owned_copy_program.get();
+  }
  auto* global_block = copy_program->MutableBlock(0);
-  if (!has_feed_operators(global_block, feed_targets, feed_holder_name)) {
+  if (!has_feed_ops) {
    // create feed_holder variable
    auto* feed_holder = global_block->Var(feed_holder_name);
    feed_holder->SetType(proto::VarType::FEED_MINIBATCH);
@@ -228,7 +232,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
    }
  }
-  if (!has_fetch_operators(global_block, fetch_targets, fetch_holder_name)) {
+  if (!has_fetch_ops) {
    // create fetch_holder variable
    auto* fetch_holder = global_block->Var(fetch_holder_name);
    fetch_holder->SetType(proto::VarType::FETCH_LIST);
@@ -251,7 +255,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
    }
  }
-  Run(*copy_program, scope, 0, true, true);
+  Run(*copy_program, scope, 0, create_vars, create_vars);
  // obtain the data of fetch_targets from fetch_holder
  for (auto* op : global_block->AllOps()) {
@@ -262,19 +266,17 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
          GetFetchVariable(*scope, fetch_holder_name, idx);
    }
  }
-  delete copy_program;
 }
-ExecutorPrepareContext* Executor::Prepare(const ProgramDesc& program,
+std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
-                                          int block_id) {
+    const ProgramDesc& program, int block_id) {
  auto* ctx = new ExecutorPrepareContext(program, block_id);
  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
  auto& block = program.Block(block_id);
  for (auto& op_desc : block.AllOps()) {
    ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
  }
-  return ctx;
+  return std::unique_ptr<ExecutorPrepareContext>(ctx);
 }
 void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
@@ -313,22 +315,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  }    // if (create_vars)
  for (auto& op : ctx->ops_) {
-    // TODO(ty):
-    // e.g. sgd should wait for allreduce to be finished
-    // if op's input is params' grad:
-    //     sync with allreduce stream
-    // SyncMultipleStreams(op);
-    VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
-    op->Run(*local_scope, place_);
    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
+    op->Run(*local_scope, place_);
-    // TODO(ty):
-    // e.g. allreduce shoudl wait for fc_grad to be finished.
-    // if op's output is params' grad:
-    //     sync with computation stream
-    //     apply allreduce on allreduce stream
-    // SyncMultipleStreams(op);
    if (FLAGS_benchmark) {
      VLOG(2) << "Memory used after operator " + op->Type() + " running: "

--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -22,7 +22,17 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-struct ExecutorPrepareContext;
+extern void InitializeVariable(Variable* var, proto::VarType::Type var_type);
+struct ExecutorPrepareContext {  // State built by Executor::Prepare for one block: the program, the block index and the created operators.
+  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
+  ~ExecutorPrepareContext();
+  const framework::ProgramDesc& prog_;  // stored as a reference, not a copy; the ProgramDesc must outlive this context
+  size_t block_id_;  // index of the block within prog_
+  std::vector<std::unique_ptr<OperatorBase>> ops_;  // operators owned by this context, filled by Executor::Prepare
+};
 class Executor {
 public:
  // TODO(dzhwinter) : Do not rely on this function, it will be removed
@@ -45,11 +55,11 @@ class Executor {
           std::map<std::string, const LoDTensor*>& feed_targets,
           std::map<std::string, LoDTensor*>& fetch_targets,
           const std::string& feed_holder_name = "feed",
-           const std::string& fetch_holder_name = "fetch");
+           const std::string& fetch_holder_name = "fetch",
+           bool create_vars = true);
- private:
+  static std::unique_ptr<ExecutorPrepareContext> Prepare(
-  static ExecutorPrepareContext* Prepare(const ProgramDesc& program,
+      const ProgramDesc& program, int block_id);
-                                         int block_id);
  void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                          bool create_local_scope = true,
@@ -59,7 +69,5 @@ class Executor {
  const platform::Place place_;
 };
-extern void InitializeVariable(Variable* var, proto::VarType::Type var_type);
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/init.cc
+++ b/paddle/fluid/framework/init.cc
@@ -26,6 +26,7 @@ namespace paddle {
 namespace framework {
 std::once_flag gflags_init_flag;
+std::once_flag p2p_init_flag;
 void InitGflags(std::vector<std::string> &argv) {
  std::call_once(gflags_init_flag, [&]() {
@@ -42,6 +43,27 @@ void InitGflags(std::vector<std::string> &argv) {
  });
 }
+void InitP2P(int count) {  // Tries to enable CUDA peer access for every ordered pair (i, j) of devices 0..count-1; does nothing when PADDLE_WITH_CUDA is undefined.
+#ifdef PADDLE_WITH_CUDA
+  std::call_once(p2p_init_flag, [&]() {  // guarded by p2p_init_flag: only the first call does the work, later calls (even with another count) are no-ops
+    for (int i = 0; i < count; ++i) {
+      for (int j = 0; j < count; ++j) {
+        if (i == j) continue;  // skip the device paired with itself
+        int can_acess = -1;  // set by cudaDeviceCanAccessPeer; only the value 1 is treated as "can access"
+        PADDLE_ENFORCE(cudaDeviceCanAccessPeer(&can_acess, i, j),
+                       "Failed to test P2P access.");
+        if (can_acess != 1) {
+          LOG(WARNING) << "Cannot enable P2P access from " << i << " to " << j;  // unsupported pairs are only logged, not treated as an error
+        } else {
+          cudaSetDevice(i);  // NOTE(review): the return codes of these two CUDA calls are not checked, and the current device is not restored afterwards
+          cudaDeviceEnablePeerAccess(j, 0);
+        }
+      }
+    }
+  });
+#endif
+}
 void InitDevices() {
  /*Init all available devices by default */
@@ -63,7 +85,7 @@ void InitDevices() {
  for (int i = 0; i < count; ++i) {
    places.emplace_back(platform::CUDAPlace(i));
  }
+  InitP2P(count);
  platform::DeviceContextPool::Init(places);
 }

--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -176,7 +176,7 @@ class Vector {
  // resize the vector
  void resize(size_t size) {
-    if (size + 1 < capacity()) {
+    if (size + 1 <= capacity()) {
      size_ = size;
    } else {
      MutableCPU();

--- a/paddle/fluid/framework/mixed_vector_test.cu
+++ b/paddle/fluid/framework/mixed_vector_test.cu
@@ -104,3 +104,11 @@ TEST(mixed_vector, ForEach) {
  for (auto& v : tmp) {
  }
 }
+TEST(mixed_vector, Reserve) {  // Reserves room for one element and then pushes three; there are no assertions, so this only checks that growing past the reserved capacity does not crash.
+  paddle::framework::Vector<int> vec;
+  vec.reserve(1);
+  vec.push_back(0);  // fits within the reserved capacity
+  vec.push_back(0);  // second and third pushes exceed the reserved size of 1
+  vec.push_back(0);
+}
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -117,10 +117,10 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
  if (holder_ != nullptr) {
    holder_->set_type(type);
  }
-  PADDLE_ENFORCE_GT(
+  PADDLE_ENFORCE_GE(numel(), 0,
-      numel(), 0,
+                    "When calling this method, the Tensor's numel must be "
-      "When calling this method, the Tensor's numel must be larger than zero. "
+                    "equal or larger than zero. "
-      "Please check Tensor::Resize has been called first.");
+                    "Please check Tensor::Resize has been called first.");
  int64_t size = numel() * SizeOfType(type);
  /* some versions of boost::variant don't have operator!= */
  if (holder_ == nullptr || !(holder_->place() == place) ||

--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -13,6 +13,11 @@ cc_library(paddle_fluid_shared SHARED
    SRCS io.cc
    DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END)
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
+if(NOT APPLE)
+  # TODO(liuyiqun): Temporarily disable the link flag because it is not supported on Mac.
+  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map")
+  set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+endif()
 if(WITH_TESTING)
  add_subdirectory(tests/book)

--- a/paddle/fluid/inference/paddle_fluid.map
+++ b/paddle/fluid/inference/paddle_fluid.map
+{
+	global:
+		*paddle*;
+	local:
+		*;
+};
--- a/paddle/fluid/memory/memory_test.cc
+++ b/paddle/fluid/memory/memory_test.cc
@@ -59,7 +59,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
  EXPECT_EQ(total_size, 0UL);
  for (auto size :
-       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
    ps[paddle::memory::Alloc(cpu, size)] = size;
    // Buddy Allocator doesn't manage too large memory chunk
@@ -117,7 +117,7 @@ TEST(BuddyAllocator, GPUMultAlloc) {
  EXPECT_EQ(total_size, 0UL);
  for (auto size :
-       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
    ps[paddle::memory::Alloc(gpu, size)] = size;
    // Buddy Allocator doesn't manage too large memory chunk

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -12,6 +12,8 @@ function(op_library TARGET)
    set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
    set(cc_srcs)
    set(cu_srcs)
+    set(hip_cu_srcs)
+    set(miopen_hip_cc_srcs)
    set(cu_cc_srcs)
    set(cudnn_cu_cc_srcs)
    set(CUDNN_FILE)
@@ -36,10 +38,19 @@ function(op_library TARGET)
        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
            list(APPEND cu_srcs ${TARGET}.cu)
        endif()
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
+            list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
+        endif()
        string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
            list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
        endif()
+        if(WITH_AMD_GPU)
+            string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
+                list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
+            endif()
+        endif()
        if(WITH_MKLDNN)
            string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
@@ -48,10 +59,14 @@ function(op_library TARGET)
        endif()
    else()
        foreach(src ${op_library_SRCS})
-            if (${src} MATCHES ".*\\.cu$")
+            if (${src} MATCHES ".*\\.hip.cu$")
+                list(APPEND hip_cu_srcs ${src})
+            elseif (${src} MATCHES ".*\\.cu$")
                list(APPEND cu_srcs ${src})
            elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
                list(APPEND cudnn_cu_cc_srcs ${src})
+            elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
+                list(APPEND miopen_hip_cc_srcs ${src})
            elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
                list(APPEND mkldnn_cc_srcs ${src})
            elseif(${src} MATCHES ".*\\.cu.cc$")
@@ -76,6 +91,9 @@ function(op_library TARGET)
    if (WITH_GPU)
        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
                ${op_common_deps})
+    elseif (WITH_AMD_GPU)
+        hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
+                ${op_common_deps})
    else()
        cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
            ${op_common_deps})
@@ -88,7 +106,7 @@ function(op_library TARGET)
        endif()
    endforeach()
-    # The registration of USE_OP, please refer to paddle/framework/op_registry.h.
+    # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
    # Note that it's enough to just adding one operator to pybind in a *_op.cc file.
    # And for detail pybind information, please see generated paddle/pybind/pybind.h.
    file(READ ${TARGET}.cc TARGET_CONTENT)
@@ -114,7 +132,10 @@ function(op_library TARGET)
    list(LENGTH cu_srcs cu_srcs_len)
    list(LENGTH cu_cc_srcs cu_cc_srcs_len)
    list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
-    if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
+    list(LENGTH hip_cu_srcs hip_cu_srcs_len)
+    list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
+    if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
+        ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
        file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
        set(pybind_flag 1)
    endif()
@@ -125,9 +146,19 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
    endif()
+    # pybind USE_OP_DEVICE_KERNEL for MIOPEN
+    if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
+    endif()
    # pybind USE_OP_DEVICE_KERNEL for MKLDNN
    if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
+      # Append first implemented MKLDNN activation operator
+      if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
+      else()
        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
+      endif()
    endif()
    # pybind USE_OP
@@ -156,9 +187,13 @@ if(WITH_DISTRIBUTE)
    set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
    set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
 else()
-    set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op)
+    set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
 endif()
 op_library(cond_op DEPS framework_proto tensor net_op)

--- a/paddle/fluid/operators/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/activation_mkldnn_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "mkldnn.hpp"
+#include "mkldnn_activation_op.h"
+#include "paddle/fluid/operators/activation_op.h"
+namespace paddle {
+namespace operators {
+using paddle::framework::Tensor;
+using paddle::platform::MKLDNNDeviceContext;
+namespace {
+template <typename T, typename ExecContext>  // Runs an MKLDNN element-wise forward primitive from input "X" to output "Out" and caches its primitive desc for the backward pass.
+void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
+                     const T alpha = 0, const T beta = 0) {  // alpha/beta are forwarded unchanged to the eltwise descriptor
+  PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                 "It must use CPUPlace.");
+  auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+  const auto &mkldnn_engine = dev_ctx.GetEngine();
+  // get buffers
+  const auto *src = ctx.template Input<Tensor>("X");
+  const auto *src_data = src->template data<T>();
+  auto *dst = ctx.template Output<Tensor>("Out");
+  const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());  // declared const, but used below (via a void* cast) as the primitive's destination buffer
+  // get memory dim
+  PADDLE_ENFORCE(src->dims().size() == 4,
+                 "Input dim must be with 4, i.e. NCHW");
+  std::vector<int> src_tz = framework::vectorize2int(src->dims());
+  // create memory description
+  // TODO(kbinias-intel): support more formats
+  auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,  // NOTE(review): data type is fixed to f32 and layout to nchw regardless of T
+                                         mkldnn::memory::format::nchw);
+  // create memory primitives
+  auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src_data);  // source and destination share the same memory description
+  auto dst_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)dst_data);
+  auto forward_desc = mkldnn::eltwise_forward::desc(
+      mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
+  // save prim desc into global device context to be referred in backward path
+  const std::string key = ctx.op().Output("Out");  // the blob key is derived from the name of the "Out" variable
+  const std::string key_eltwise_pd = key + "@eltwise_pd";
+  auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
+      forward_desc, mkldnn_engine);
+  dev_ctx.SetBlob(key_eltwise_pd, forward_pd);
+  auto eltwise = mkldnn::eltwise_forward(*forward_pd, src_memory, dst_memory);
+  // push primitive to stream and wait until it's executed
+  std::vector<mkldnn::primitive> pipeline = {eltwise};
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();  // wait() blocks until the submitted pipeline has run
+}
+template <typename T, typename ExecContext>  // Runs the MKLDNN element-wise backward primitive: reads "X" and the gradient of "Out", writes the gradient of "X".
+void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
+                  const T alpha = 0, const T beta = 0) {  // alpha/beta are forwarded unchanged to the backward descriptor
+  auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+  const auto &mkldnn_engine = dev_ctx.GetEngine();
+  // get buffers
+  const auto *x = ctx.template Input<Tensor>("X");
+  const auto *src = x->template data<T>();
+  auto *dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
+  const auto *diff_dst = dout->template data<T>();
+  auto *dx =
+      ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
+  const T *diff_src = dx->template mutable_data<T>(ctx.GetPlace());  // declared const, but used below (via a void* cast) as the primitive's output buffer
+  // get memory dim
+  std::vector<int> src_tz = framework::vectorize2int(x->dims());  // NOTE(review): no 4-D check here, unlike eltwise_forward, although the layout below is still nchw
+  // create memory description
+  auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,  // NOTE(review): data type is fixed to f32 regardless of T
+                                         mkldnn::memory::format::nchw);
+  // create memory primitives
+  auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src);  // all three buffers use the same memory description built from X's dims
+  auto diff_src_memory =
+      mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_src);
+  auto diff_dst_memory =
+      mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_dst);
+  auto backward_desc =
+      mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta);
+  // retrieve eltwise primitive desc from device context
+  const std::string key = ctx.op().Input("Out");  // must resolve to the same name the forward pass used for Output("Out"), otherwise the lookup below fails
+  const std::string key_eltwise_pd = key + "@eltwise_pd";
+  const std::shared_ptr<void> forward_pd = dev_ctx.GetBlob(key_eltwise_pd);
+  PADDLE_ENFORCE(forward_pd != nullptr,
+                 "Fail to find eltwise_pd in device context");
+  auto *p_forward_pd =
+      static_cast<mkldnn::eltwise_forward::primitive_desc *>(forward_pd.get());  // unchecked cast: relies on the blob under this key having been stored by eltwise_forward
+  auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
+      backward_desc, mkldnn_engine, *p_forward_pd);
+  auto eltwise_bwd = mkldnn::eltwise_backward(eltwise_bwd_prim_desc, src_memory,
+                                              diff_dst_memory, diff_src_memory);
+  // push primitive to stream and wait until it's executed
+  std::vector<mkldnn::primitive> pipeline = {eltwise_bwd};
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();  // wait() blocks until the submitted pipeline has run
+}
+}  // anonymous namespace
+template <typename T, mkldnn::algorithm algorithm>  // Forward functor: binds one MKLDNN eltwise algorithm at compile time and delegates to eltwise_forward.
+struct MKLDNNActivationFunc : public BaseActivationFunctor<T> {
+  template <typename ExecContext>
+  void operator()(const ExecContext &ctx) const {
+    eltwise_forward<T>(ctx, algorithm);  // alpha and beta are left at their defaults of 0
+  }
+};
+template <typename T, mkldnn::algorithm algorithm>  // Backward functor: binds one MKLDNN eltwise algorithm at compile time and delegates to eltwise_grad.
+struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
+  template <typename ExecContext>
+  void operator()(const ExecContext &ctx) const {
+    eltwise_grad<T>(ctx, algorithm);  // alpha and beta are left at their defaults of 0
+  }
+};
+template <typename T>  // Forward functor aliases: one per supported algorithm (relu, tanh, sqrt, abs).
+using ReluMkldnnFunctor =
+    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_relu>;
+template <typename T>
+using TanhMkldnnFunctor =
+    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_tanh>;
+template <typename T>
+using SqrtMkldnnFunctor =
+    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_sqrt>;
+template <typename T>
+using AbsMkldnnFunctor =
+    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_abs>;
+template <typename T>  // Gradient functor aliases: same four algorithms, bound to MKLDNNActivationGradFunc.
+using ReluMkldnnGradFunctor =
+    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_relu>;
+template <typename T>
+using TanhMkldnnGradFunctor =
+    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_tanh>;
+template <typename T>
+using SqrtMkldnnGradFunctor =
+    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_sqrt>;
+template <typename T>
+using AbsMkldnnGradFunctor =
+    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_abs>;
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+#define REGISTER_ACTIVATION_MKLDNN_KERNEL(act_type, functor, grad_functor) \
+  REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace,       \
+                     ops::MKLDNNActivationKernel<ops::functor<float>>);    \
+  REGISTER_OP_KERNEL(                                                      \
+      act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace,               \
+      ops::MKLDNNActivationGradKernel<ops::grad_functor<float>>);  // registers act_type and act_type##_grad as MKLDNN kernels on CPUPlace, float only
+#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro)            \
+  __macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor); \
+  __macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor); \
+  __macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor); \
+  __macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor);  // applies __macro to each (op name, forward functor, grad functor) triple
+FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL);  // expands to the MKLDNN registrations for relu, tanh, sqrt and abs
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/operators/mkldnn_activation_op.h"
 namespace paddle {
 namespace operators {
@@ -87,6 +88,9 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Relu operator");
    AddOutput("Out", "Output of Relu operator");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
    AddComment(R"DOC(
 Relu Activation Operator.
@@ -140,6 +144,9 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Tanh operator");
    AddOutput("Out", "Output of Tanh operator");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
    AddComment(R"DOC(
 Tanh Activation Operator.
@@ -193,6 +200,9 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Sqrt operator");
    AddOutput("Out", "Output of Sqrt operator");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
    AddComment(R"DOC(
 Sqrt Activation Operator.
@@ -208,6 +218,9 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Abs operator");
    AddOutput("Out", "Output of Abs operator");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
    AddComment(R"DOC(
 Abs Activation Operator.
@@ -524,11 +537,11 @@ REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker,
 REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
            ops::ActivationOpGrad);
-REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad,
+REGISTER_OP(relu, ops::ActivationWithMKLDNNOp, ops::ReluOpMaker, relu_grad,
-            ops::ActivationOpGrad);
+            ops::ActivationWithMKLDNNOpGrad);
-REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad,
+REGISTER_OP(tanh, ops::ActivationWithMKLDNNOp, ops::TanhOpMaker, tanh_grad,
-            ops::ActivationOpGrad);
+            ops::ActivationWithMKLDNNOpGrad);
 REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
            tanh_shrink_grad, ops::ActivationOpGrad);
@@ -536,11 +549,11 @@ REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
 REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker,
            softshrink_grad, ops::ActivationOpGrad);
-REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
+REGISTER_OP(sqrt, ops::ActivationWithMKLDNNOp, ops::SqrtOpMaker, sqrt_grad,
-            ops::ActivationOpGrad);
+            ops::ActivationWithMKLDNNOpGrad);
-REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad,
+REGISTER_OP(abs, ops::ActivationWithMKLDNNOp, ops::AbsOpMaker, abs_grad,
-            ops::ActivationOpGrad);
+            ops::ActivationWithMKLDNNOpGrad);
 REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad,
            ops::ActivationOpGrad);
@@ -613,3 +626,14 @@ REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
                                ops::grad_functor<double>>);
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
+REGISTER_OP_CPU_KERNEL(relu,  // relu is registered explicitly here; this commit removes it from the FOR_EACH_KERNEL_FUNCTOR list in activation_op.h
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::ReluFunctor<float>>,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::ReluFunctor<double>>);
+REGISTER_OP_CPU_KERNEL(  // matching CPU gradient kernels for relu, float and double
+    relu_grad, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                                         ops::ReluGradFunctor<float>>,
+    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                              ops::ReluGradFunctor<double>>);
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
@@ -31,3 +32,16 @@ namespace ops = paddle::operators;
                                ops::grad_functor<double>>);
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
+REGISTER_OP_CUDA_KERNEL(  // relu forward on CUDA: float, double and float16
+    relu, ops::ActivationKernel<paddle::platform::CUDADeviceContext,
+                                ops::ReluFunctor<float>>,
+    ops::ActivationKernel<paddle::platform::CUDADeviceContext,
+                          ops::ReluFunctor<double>>,
+    ops::ActivationKernel<paddle::platform::CUDADeviceContext,
+                          ops::ReluFunctor<paddle::platform::float16>>);
+REGISTER_OP_CUDA_KERNEL(  // relu gradient on CUDA: float and double only; NOTE(review): no float16 gradient kernel is registered - confirm this is intended
+    relu_grad, ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,
+                                         ops::ReluGradFunctor<float>>,
+    ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,
+                              ops::ReluGradFunctor<double>>);
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,6 +17,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 namespace paddle {
 namespace operators {
@@ -772,7 +776,6 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
  __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor);              \
  __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);     \
  __macro(exp, ExpFunctor, ExpGradFunctor);                          \
-  __macro(relu, ReluFunctor, ReluGradFunctor);                       \
  __macro(tanh, TanhFunctor, TanhGradFunctor);                       \
  __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor);     \
  __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                       \

--- a/paddle/fluid/operators/average_accumulates_op.cc
+++ b/paddle/fluid/operators/average_accumulates_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/average_accumulates_op.h"
+namespace paddle {
+namespace operators {
+// CPU specialization: loads the three int64_t counters straight from host
+// memory. Only element 0 of each counter tensor is read.
+template <>
+void GetAccumulators<paddle::platform::CPUDeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t& num_updates_,
+    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
+  const Tensor* old_accumulates_in =
+      ctx.Input<Tensor>("in_old_num_accumulates");
+  const Tensor* accumulates_in = ctx.Input<Tensor>("in_num_accumulates");
+  const Tensor* updates_in = ctx.Input<Tensor>("in_num_updates");
+
+  const int64_t* old_accumulates_ptr = old_accumulates_in->data<int64_t>();
+  old_num_accumulates_ = *old_accumulates_ptr;
+
+  const int64_t* accumulates_ptr = accumulates_in->data<int64_t>();
+  num_accumulates_ = *accumulates_ptr;
+
+  const int64_t* updates_ptr = updates_in->data<int64_t>();
+  num_updates_ = *updates_ptr;
+}
+// CPU specialization: stores the three int64_t counters into element 0 of
+// the corresponding output tensors, directly in host memory.
+template <>
+void SetAccumulators<paddle::platform::CPUDeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t num_updates_,
+    int64_t num_accumulates_, int64_t old_num_accumulates_) {
+  Tensor* old_accumulates_out = ctx.Output<Tensor>("out_old_num_accumulates");
+  Tensor* accumulates_out = ctx.Output<Tensor>("out_num_accumulates");
+  Tensor* updates_out = ctx.Output<Tensor>("out_num_updates");
+
+  int64_t* old_accumulates_ptr = old_accumulates_out->data<int64_t>();
+  *old_accumulates_ptr = old_num_accumulates_;
+
+  int64_t* accumulates_ptr = accumulates_out->data<int64_t>();
+  *accumulates_ptr = num_accumulates_;
+
+  int64_t* updates_ptr = updates_out->data<int64_t>();
+  *updates_ptr = num_updates_;
+}
+// Operator definition for average_accumulates: verifies that every input and
+// output slot is present, infers the output shapes and selects the kernel.
+class AverageAccumulatesOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Checks that all seven inputs and six outputs exist, then gives the three
+  // sum outputs the shape of "param" and the three counter outputs shape {1}.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    // Required inputs: the parameter, the three running sums and the three
+    // counters.
+    PADDLE_ENFORCE(
+        ctx->HasInput("param"),
+        "Input (param) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_sum_1"),
+        "Input (sum_1) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_sum_2"),
+        "Input (sum_2) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_sum_3"),
+        "Input (sum_3) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_num_accumulates"),
+        "Input (in_num_accumulates) of average_accumulates op should "
+        "not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("in_old_num_accumulates"),
+                   "Input (old_num_accumulates) of average_accumulates op "
+                   "should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_num_updates"),
+        "Input (num_updates) of average_accumulates op should not be null.");
+    // Required outputs: one per running sum and one per counter.
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_sum_1"),
+        "Output (sum_1) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_sum_2"),
+        "Output (sum_2) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_sum_3"),
+        "Output (sum_3) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("out_num_accumulates"),
+                   "Output (num_accumulates) of average_accumulates op should "
+                   "not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("out_old_num_accumulates"),
+                   "Output (old_num_accumulates) of average_accumulates op "
+                   "should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_num_updates"),
+        "Output (num_updates) of average_accumulates op should not be null.");
+    // The sums take the shape of the parameter; each counter is a
+    // one-element tensor.
+    auto in_dim = ctx->GetInputDim("param");
+    ctx->SetOutputDim("out_sum_1", in_dim);
+    ctx->SetOutputDim("out_sum_2", in_dim);
+    ctx->SetOutputDim("out_sum_3", in_dim);
+    ctx->SetOutputDim("out_num_accumulates", {1});
+    ctx->SetOutputDim("out_old_num_accumulates", {1});
+    ctx->SetOutputDim("out_num_updates", {1});
+  }
+ protected:
+  // The kernel is chosen from the data type of "param" and the place of the
+  // execution context.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("param")->type()),
+        ctx.GetPlace());
+  }
+};
+// Declares the inputs, outputs, attributes and user-facing documentation of
+// the average_accumulates operator.
+class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // Inputs: the parameter, three running sums and three int64_t counters.
+    AddInput("param", "(Tensor), The parameter to be accumulated.");
+    AddInput("in_sum_1",
+             "(Tensor), A tensor used to store the parameter "
+             "sums with the same shape as input(param).");
+    AddInput("in_sum_2",
+             "(Tensor), An auxiliary tensor to help "
+             "accumulating sums of parameter values with the same shape as "
+             "input(param). It is used to avoid loss of precision due to too "
+             "many sums.");
+    AddInput("in_sum_3",
+             "(Tensor), An auxiliary tensor to help "
+             "accumulating sums of parameter values with the same shape as "
+             "input(param).");
+    AddInput("in_num_accumulates",
+             "(Tensor<int64_t>), The accumulating times of current window with "
+             "shape [1].");
+    AddInput(
+        "in_old_num_accumulates",
+        "(Tensor<int64_t>), The accumulating times of previous window with "
+        "shape [1].");
+    AddInput("in_num_updates",
+             "(Tensor<int64_t>), The total number of batches used by training "
+             "before this batch with shape [1].");
+    // Outputs mirror the inputs one-to-one.
+    AddOutput("out_sum_1",
+              "(Tensor), A tensor used to store the "
+              "parameter sums with the same shape as input(param).");
+    AddOutput("out_sum_2",
+              "(Tensor), An auxiliary tensor to help "
+              "accumulating sums of parameter values with the same shape as "
+              "input(param). It is used to avoid loss of precision due to too "
+              "many sums.");
+    AddOutput("out_sum_3",
+              "(Tensor), An auxiliary tensor to help "
+              "accumulating sums of parameter values with the same shape as "
+              "input(param).");
+    AddOutput(
+        "out_num_accumulates",
+        "(Tensor<int64_t>), The accumulating times of current window with "
+        "shape [1].");
+    AddOutput(
+        "out_old_num_accumulates",
+        "(Tensor<int64_t>), The accumulating times of previous window with "
+        "shape [1].");
+    AddOutput(
+        "out_num_updates",
+        "(Tensor<int64_t>), The total number of batches used by training "
+        "before this batch with shape [1].");
+    // Attributes controlling the size of the averaging window.
+    // max_average_window has no default and must be supplied by the caller.
+    AddAttr<float>("average_window",
+                   "(float, default 0) "
+                   "The rate of average window size relative to num_updates.")
+        .SetDefault(0);
+    AddAttr<int64_t>("max_average_window",
+                     "(int64_t) "
+                     "Maximum size of average window. It suggests that the "
+                     "number of mini-batches "
+                     "in one pass is appropriate value to set.");
+    AddAttr<int64_t>("min_average_window",
+                     "(int64_t, default 10000L) "
+                     "Minimum size of average window.")
+        .SetDefault(10000L);
+    AddComment(R"DOC(
+AverageAccumulates Operator.
+Accumulate the sum of parameter within sliding window. The size of sliding window is
+determined by 'average_window', 'max_average_window' and 'min_average_window'.
+Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'.
+'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'.
+All the accumulators were inited to zero before training.
+And for a mini-batch in training, accumulators were computed as below steps:
+    num_updates += 1
+    num_accumulates += 1
+    sum_1 += param
+    if num_updates % kMaxNumAccumulates == 0:
+        sum_2 += sum_1
+        sum_1 = 0
+    if num_accumulates >= min_average_window && num_accumulates >= min(max_average_window, num_updates * average_window):
+        sum_3 = sum_1 + sum_2
+        sum_1 = 0
+        sum_2 = 0
+        old_num_accumulates = num_accumulates
+        num_accumulates = 0
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the operator with EmptyGradOpMaker as its gradient-op maker, and
+// provide CPU kernels for float and double parameters.
+REGISTER_OPERATOR(average_accumulates, ops::AverageAccumulatesOp,
+                  ops::AverageAccumulatesOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    average_accumulates,
+    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/average_accumulates_op.cu
+++ b/paddle/fluid/operators/average_accumulates_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/average_accumulates_op.h"
+#include "paddle/fluid/platform/gpu_info.h"
+namespace paddle {
+namespace operators {
+// CUDA specialization: copies the three int64_t counters from device memory
+// into the host-side reference arguments, using the kernel's CUDA stream.
+template <>
+void GetAccumulators<paddle::platform::CUDADeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t& num_updates_,
+    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
+  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
+  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
+  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
+  auto stream = ctx.cuda_device_context().stream();
+  // Copy from the device this kernel actually runs on. A default-constructed
+  // platform::CUDAPlace() does not carry the kernel's device id, which is
+  // wrong whenever the op is placed on another GPU.
+  auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+  // NOTE(review): the caller consumes these host values right after this
+  // function returns; this relies on the device-to-host copy into ordinary
+  // (pageable) host memory having completed by then - confirm.
+  memory::Copy(platform::CPUPlace(), &old_num_accumulates_, gpu_place,
+               in_old_num_accumulates->data<int64_t>(), sizeof(int64_t),
+               stream);
+  memory::Copy(platform::CPUPlace(), &num_accumulates_, gpu_place,
+               in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
+  memory::Copy(platform::CPUPlace(), &num_updates_, gpu_place,
+               in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
+}
+// CUDA specialization: copies the three int64_t counters from the by-value
+// host arguments into the output tensors on the device, using the kernel's
+// CUDA stream.
+template <>
+void SetAccumulators<paddle::platform::CUDADeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t num_updates_,
+    int64_t num_accumulates_, int64_t old_num_accumulates_) {
+  auto stream = ctx.cuda_device_context().stream();
+  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
+  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
+  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
+  // Copy to the device this kernel actually runs on. A default-constructed
+  // platform::CUDAPlace() does not carry the kernel's device id, which is
+  // wrong whenever the op is placed on another GPU.
+  auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+  // NOTE(review): the sources are function-local values; this relies on the
+  // host-to-device copy having read them before this function returns -
+  // confirm.
+  memory::Copy(gpu_place, out_old_num_accumulates->data<int64_t>(),
+               platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
+               stream);
+  memory::Copy(gpu_place, out_num_accumulates->data<int64_t>(),
+               platform::CPUPlace(), &num_accumulates_, sizeof(int64_t),
+               stream);
+  memory::Copy(gpu_place, out_num_updates->data<int64_t>(),
+               platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream);
+}
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// CUDA kernels for average_accumulates, instantiated for float and double
+// parameters.
+REGISTER_OP_CUDA_KERNEL(
+    average_accumulates,
+    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/fluid/operators/average_accumulates_op.h
+++ b/paddle/fluid/operators/average_accumulates_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+// Shorthand for framework::EigenVector; the kernel below uses
+// EigenVector<T>::Flatten to view tensors as Eigen expressions.
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+// Reads the counters from the inputs "in_num_updates", "in_num_accumulates"
+// and "in_old_num_accumulates" into the given host-side references.
+// Only declared here; each device context supplies its own explicit
+// specialization (CPU in the .cc file, CUDA in the .cu file).
+template <typename DeviceContext>
+void GetAccumulators(const framework::ExecutionContext& ctx,
+                     int64_t& num_updates, int64_t& num_accumulates,
+                     int64_t& old_num_accumulates);
+// Writes the counters to the outputs "out_num_updates",
+// "out_num_accumulates" and "out_old_num_accumulates".
+// Only declared here; specialized per device context like GetAccumulators.
+template <typename DeviceContext>
+void SetAccumulators(const framework::ExecutionContext& ctx,
+                     int64_t num_updates, int64_t num_accumulates,
+                     int64_t old_num_accumulates);
+// Kernel of the average_accumulates operator: adds "param" to the running
+// sum and maintains the window counters, for element type T on the given
+// DeviceContext.
+template <typename DeviceContext, typename T>
+class AverageAccumulatesKernel : public framework::OpKernel<T> {
+ public:
+  // Performs one accumulation step:
+  //   1. load the counters and increment num_updates / num_accumulates;
+  //   2. out_sum_1 = in_sum_1 + param, out_sum_2/3 = in_sum_2/3;
+  //   3. every kMaxNumAccumulates updates, move sum_1 into sum_2;
+  //   4. when the window is full, store the window total in sum_3, clear
+  //      sum_1 and sum_2 and restart the window counter;
+  //   5. write the counters back.
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // It is used to avoid loss of precision
+    // (sum_1 is folded into sum_2 every kMaxNumAccumulates updates).
+    static const int64_t kMaxNumAccumulates = 16384;
+    // Get accumulators from input
+    int64_t num_updates = 0;
+    int64_t num_accumulates = 0;
+    int64_t old_num_accumulates = 0;
+    GetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
+                                   old_num_accumulates);
+    // Get attrs
+    float average_window = ctx.Attr<float>("average_window");
+    int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
+    int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
+    // The minimum window may not exceed the maximum window.
+    min_average_window =
+        std::min<int64_t>(min_average_window, max_average_window);
+    // Get inputs
+    auto* param = ctx.Input<Tensor>("param");
+    auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
+    auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
+    auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
+    auto param_tensor = EigenVector<T>::Flatten(*param);
+    auto in_sum_1_tensor = EigenVector<T>::Flatten(*in_sum_1);
+    auto in_sum_2_tensor = EigenVector<T>::Flatten(*in_sum_2);
+    auto in_sum_3_tensor = EigenVector<T>::Flatten(*in_sum_3);
+    // Get outputs
+    // NOTE(review): no mutable_data<T>() is called on the outputs in this
+    // function, so they appear to be expected to own memory already. The
+    // operator's documentation says in_sum_k and out_sum_k share memory -
+    // confirm that this always holds.
+    auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
+    auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
+    auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
+    auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
+    auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
+    auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
+    // Compute
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    math::SetConstant<DeviceContext, T> constant_functor;
+    ++num_updates;
+    ++num_accumulates;
+    out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
+    out_sum_2_tensor.device(place) = in_sum_2_tensor;
+    out_sum_3_tensor.device(place) = in_sum_3_tensor;
+    // NOTE(review): the two branches below read in_sum_1_tensor and
+    // in_sum_2_tensor, not the out_* tensors. Whether those reads include
+    // the parameter just added (and the zeroing done by the first branch)
+    // depends on the inputs aliasing the outputs - confirm before
+    // reordering any of these statements.
+    if (num_updates % kMaxNumAccumulates == 0) {
+      // Move the sum to a different buffer to avoid loss of precision due to
+      // too many sums.
+      out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
+      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
+                       0.0);
+    }
+    // num_updates * average_window is a floating-point product; it is
+    // converted to int64_t when passed to std::min<int64_t>.
+    if (num_accumulates >= min_average_window &&
+        num_accumulates >= std::min<int64_t>(max_average_window,
+                                             num_updates * average_window)) {
+      //  Now the average window is too long, discard the old sum.
+      out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
+      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
+                       0.0);
+      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_2,
+                       0.0);
+      // Remember the length of the window that just closed and start a new
+      // one.
+      old_num_accumulates = num_accumulates;
+      num_accumulates = 0;
+    }
+    // Set accumulators to output
+    SetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
+                                   old_num_accumulates);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -80,6 +80,29 @@ class BatchNormOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("SavedVariance", {C});
    ctx->ShareLoD("X", "Y");
  }
+ protected:
+  // Selects the kernel from the data type of input "X" and the execution
+  // place, after enforcing that Scale, Bias, Mean and Variance are FP32.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    // Whatever the data type of X (e.g. float or float16), the scale, bias,
+    // mean and variance tensors are all required to be float (FP32).
+    auto bn_param_type = framework::proto::VarType::FP32;
+    PADDLE_ENFORCE_EQ(bn_param_type,
+                      framework::ToDataType(ctx.Input<Tensor>("Scale")->type()),
+                      "Scale input should be of float type");
+    PADDLE_ENFORCE_EQ(bn_param_type,
+                      framework::ToDataType(ctx.Input<Tensor>("Bias")->type()),
+                      "Bias input should be of float type");
+    PADDLE_ENFORCE_EQ(bn_param_type,
+                      framework::ToDataType(ctx.Input<Tensor>("Mean")->type()),
+                      "Mean input should be of float type");
+    PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType(
+                                         ctx.Input<Tensor>("Variance")->type()),
+                      "Variance input should be of float type");
+    // The kernel itself is keyed on the type of X, not on bn_param_type.
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -434,12 +457,39 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
  }
 };
+// Builds the description of the batch_norm_grad operator from the forward
+// batch_norm operator.
+class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  // Returns a batch_norm_grad OpDesc wired to the forward op's X, Scale,
+  // SavedMean, SavedVariance and the gradient of Y; it produces the
+  // gradients of X, Scale and Bias and inherits the forward op's attributes.
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    // Own the OpDesc from the start so it is released if any call below
+    // throws (a raw pointer wrapped only at the return would leak).
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("batch_norm_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
+    op->SetInput("Scale", Input("Scale"));
+    op->SetInput("SavedMean", Output("SavedMean"));
+    op->SetInput("SavedVariance", Output("SavedVariance"));
+    op->SetAttrMap(Attrs());
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
+    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+    return op;
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
+REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
-            batch_norm_grad, ops::BatchNormGradOp);
+                  ops::BatchNormGradMaker);
+REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
 REGISTER_OP_CPU_KERNEL(
    batch_norm,
    ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>);

--- a/paddle/fluid/operators/batch_norm_op.cu.cc
+++ b/paddle/fluid/operators/batch_norm_op.cu.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cfloat>
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
+#include "paddle/fluid/platform/float16.h"
 namespace paddle {
 namespace operators {
@@ -26,6 +27,8 @@ using Tensor = framework::Tensor;
 using DataLayout = framework::DataLayout;
 template <typename T>
 using CudnnDataType = platform::CudnnDataType<T>;
+template <typename T>
+using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout,
                  int *N, int *C, int *H, int *W, int *D) {
@@ -104,8 +107,9 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
        data_desc_, CudnnDataType<T>::type,
        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
+    // Note: PERSISTENT not implemented for inference
    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        bn_param_desc_, data_desc_, mode_));
+        bn_param_desc_, data_desc_, is_test ? CUDNN_BATCHNORM_SPATIAL : mode_));
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");
@@ -118,15 +122,16 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
    // alloc memory
    y->mutable_data<T>(ctx.GetPlace());
-    mean_out->mutable_data<T>(ctx.GetPlace());
+    mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    variance_out->mutable_data<T>(ctx.GetPlace());
+    variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    saved_mean->mutable_data<T>(ctx.GetPlace());
+    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    saved_variance->mutable_data<T>(ctx.GetPlace());
+    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    math::SetConstant<platform::CUDADeviceContext, T> functor;
+    math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
-    functor(dev_ctx, saved_mean, 0);
+        functor;
-    functor(dev_ctx, saved_variance, 0);
+    functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
+    functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
    auto handle = dev_ctx.cudnn_handle();
@@ -147,8 +152,10 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
          CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
          CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
          data_desc_, y->template mutable_data<T>(ctx.GetPlace()),
-          bn_param_desc_, scale->template data<T>(), bias->template data<T>(),
+          bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
-          est_mean->template data<T>(), est_var->template data<T>(), epsilon));
+          bias->template data<BatchNormParamType<T>>(),
+          est_mean->template data<BatchNormParamType<T>>(),
+          est_var->template data<BatchNormParamType<T>>(), epsilon));
    } else {
      // Run training mode.
      // obtain running mean and running inv var, and see if we need to
@@ -159,11 +166,16 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
          handle, mode_, CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
          data_desc_, x->template data<T>(), data_desc_,
          y->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
-          scale->template data<T>(), bias->template data<T>(), this_factor,
+          scale->template data<BatchNormParamType<T>>(),
-          mean_out->template mutable_data<T>(ctx.GetPlace()),
+          bias->template data<BatchNormParamType<T>>(), this_factor,
-          variance_out->template mutable_data<T>(ctx.GetPlace()), epsilon,
+          mean_out->template mutable_data<BatchNormParamType<T>>(
-          saved_mean->template mutable_data<T>(ctx.GetPlace()),
+              ctx.GetPlace()),
-          saved_variance->template mutable_data<T>(ctx.GetPlace())));
+          variance_out->template mutable_data<BatchNormParamType<T>>(
+              ctx.GetPlace()),
+          epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
+                       ctx.GetPlace()),
+          saved_variance->template mutable_data<BatchNormParamType<T>>(
+              ctx.GetPlace())));
    }
    // clean when exit.
@@ -270,9 +282,9 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
 }  // namespace paddle
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
-    batch_norm,
+    batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
-    ops::BatchNormKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
-    batch_norm_grad,
+    batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>);
-    ops::BatchNormGradKernel<paddle::platform::CUDADeviceContext, float>);
--- a/paddle/fluid/operators/box_coder_op.cc
+++ b/paddle/fluid/operators/box_coder_op.cc
@@ -126,6 +126,7 @@ width and height.
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
+REGISTER_OPERATOR(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
                       ops::BoxCoderKernel<double>);
--- a/paddle/fluid/operators/cast_op.cc
+++ b/paddle/fluid/operators/cast_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/cast_op.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/float16.h"
 namespace paddle {
 namespace operators {
@@ -88,4 +89,5 @@ REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
                       ops::CastOpKernel<CPU, double>,
                       ops::CastOpKernel<CPU, int>,
                       ops::CastOpKernel<CPU, int64_t>,
-                       ops::CastOpKernel<CPU, bool>);
+                       ops::CastOpKernel<CPU, bool>,
+                       ops::CastOpKernel<CPU, paddle::platform::float16>);
--- a/paddle/fluid/operators/cast_op.cu
+++ b/paddle/fluid/operators/cast_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/cast_op.h"
+#include "paddle/fluid/platform/float16.h"
 template <typename T>
 using CastOpKernel =
@@ -20,4 +21,5 @@ using CastOpKernel =
 REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>,
                        CastOpKernel<int>, CastOpKernel<int64_t>,
-                        CastOpKernel<bool>);
+                        CastOpKernel<bool>,
+                        CastOpKernel<paddle::platform::float16>);
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -28,6 +28,8 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
 using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using DataLayout = platform::DataLayout;
+template <typename T>
+using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
    static_cast<size_t>(1024) * 1024 * 1024;
@@ -134,8 +136,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
    // ------------------- cudnn conv forward ---------------------
-    typename platform::CudnnDataType<T>::ScalingParamType alpha = 1.0f,
+    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-                                                          beta = 0.0f;
    for (int i = 0; i < groups; i++) {
      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
          handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
@@ -282,8 +283,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
    // ------------------- cudnn conv backward data ---------------------
-    typename platform::CudnnDataType<T>::ScalingParamType alpha = 1.0f,
+    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-                                                          beta = 0.0f;
    if (input_grad) {
      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
      // Because beta is zero, it is unnecessary to reset input_grad.

--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -70,16 +70,16 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
-  framework::LibraryType library_{framework::LibraryType::kPlain};
+  framework::LibraryType library{framework::LibraryType::kPlain};
 #ifdef PADDLE_WITH_CUDA
  if (platform::CanCUDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kCUDNN;
+    library = framework::LibraryType::kCUDNN;
  }
 #endif
 #ifdef PADDLE_WITH_MKLDNN
-  if (library_ == framework::LibraryType::kPlain &&
+  if (library == framework::LibraryType::kPlain &&
      platform::CanMKLDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kMKLDNN;
+    library = framework::LibraryType::kMKLDNN;
  }
 #endif
@@ -91,15 +91,15 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
                    "input and filter data type should be consistent");
  if (input_data_type == framework::proto::VarType::FP16) {
-    PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
+    PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
                      "float16 can only be used when CUDNN is used");
  }
  std::string data_format = ctx.Attr<std::string>("data_format");
  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+  framework::DataLayout layout = framework::StringToDataLayout(data_format);
-  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
+  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
-                                 library_);
+                                 library);
 }
 Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)

--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@@ -78,7 +78,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
      for (int64_t i = 0; i < batch_size; ++i) {
-        PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
+        // A label is valid only when it satisfies both bounds. The previous
+        // `||` form was true for every value, so the check never fired and an
+        // out-of-range label went on to compute `index` unchecked.
+        PADDLE_ASSERT(label_data[i] >= 0 && label_data[i] < class_num);
        int64_t index = i * class_num + label_data[i];
-        dx_data[index] = -dy_data[i] / x_data[index];
+        dx_data[index] = math::TolerableValue<T>()(-dy_data[i] / x_data[index]);
      }
    }
  }

--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
 if(WITH_DISTRIBUTE)
-  grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
+  grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
+      grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  cc_test(serde_test SRCS test_serde.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc)
+  cc_test(serde_test SRCS test_serde.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
+      cares zlib protobuf sendrecvop_grpc)
 endif()
--- a/paddle/fluid/operators/detail/bytebuffer_stream.h
+++ b/paddle/fluid/operators/detail/bytebuffer_stream.h
@@ -23,9 +23,107 @@ limitations under the License. */
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
+namespace grpc {
+// A ZeroCopyInputStream that reads from grpc_byte_buffer, one slice at a time.
+class GrpcBufferReader final
+    : public ::google::protobuf::io::ZeroCopyInputStream {
+  // grpc_byte_buffer_reader_init is accepted with either a void or an int
+  // return type; overload resolution on the member-pointer type picks the
+  // matching ReaderInit below.
+  typedef void (CoreCodegenInterface::*OldReaderInitAPI)(
+      grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer);
+  typedef int (CoreCodegenInterface::*NewReaderInitAPI)(
+      grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer);
+  void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader,
+                  grpc_byte_buffer* buffer) {
+    (g_core_codegen_interface->*ptr)(reader, buffer);
+  }
+  void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader,
+                  grpc_byte_buffer* buffer) {
+    // The int result is deliberately ignored.
+    int result = (g_core_codegen_interface->*ptr)(reader, buffer);
+    (void)result;
+  }
+ public:
+  // Starts a reader over `buffer`. The reader is released in the destructor.
+  explicit GrpcBufferReader(grpc_byte_buffer* buffer)
+      : byte_count_(0), backup_count_(0) {
+    ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_,
+               buffer);
+  }
+  ~GrpcBufferReader() override {
+    g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_);
+  }
+  // Hands out the next chunk of bytes through *data / *size. Bytes given back
+  // with BackUp() are served first; otherwise the next slice is pulled from
+  // the reader. Returns false when the reader yields no further slice.
+  bool Next(const void** data, int* size) override {
+    if (backup_count_ > 0) {
+      // Re-serve the tail of the current slice that was backed up.
+      *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) -
+              backup_count_;
+      GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX);
+      *size = static_cast<int>(backup_count_);
+      backup_count_ = 0;
+      return true;
+    }
+    if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_,
+                                                                &slice_)) {
+      return false;
+    }
+    // NOTE(review): the slice is unref'd before its bytes are handed out;
+    // presumably the byte buffer keeps the underlying data alive -- confirm.
+    g_core_codegen_interface->grpc_slice_unref(slice_);
+    *data = GRPC_SLICE_START_PTR(slice_);
+    // On win x64, int is only 32bit
+    GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
+    *size = static_cast<int>(GRPC_SLICE_LENGTH(slice_));
+    byte_count_ += *size;
+    return true;
+  }
+  // Marks the last `count` bytes of the most recent chunk as unread; the next
+  // call to Next() returns them again.
+  void BackUp(int count) override { backup_count_ = count; }
+  // Advances the stream by `count` bytes. Returns false if the stream ends
+  // before that many bytes were consumed.
+  bool Skip(int count) override {
+    const void* data;
+    int size;
+    while (Next(&data, &size)) {
+      if (size >= count) {
+        // Give back whatever part of this chunk lies beyond the skip target.
+        BackUp(size - count);
+        return true;
+      }
+      // size < count;
+      count -= size;
+    }
+    // error or we have too large count;
+    return false;
+  }
+  // Bytes handed out so far, not counting bytes currently backed up.
+  ::google::protobuf::int64 ByteCount() const override {
+    return byte_count_ - backup_count_;
+  }
+ private:
+  int64_t byte_count_;    // total size of all slices returned by Next()
+  int64_t backup_count_;  // bytes of slice_ still to be re-served
+  grpc_byte_buffer_reader reader_;
+  grpc_slice slice_;  // most recent slice obtained from reader_
+};
+}  // namespace grpc
 namespace paddle {
 namespace operators {
 namespace detail {
+// Source provides a way for a particular RPC implementation to provide
+// received data to the parser.
+class Source {
+ public:
+  virtual ~Source() {}
+  // Return the stream that contains the data to be parsed.
+  // Note that this method might be invoked more than once if
+  // the parser needs to fall back to a more expensive parsing method.
+  // Every call must return a stream pointing at the beginning of
+  // the serialized message.
+  //
+  // Note that a subsequent call to contents() invalidates previous
+  // results of contents().
+  //
+  // Ownership of the returned stream is retained by the Source and
+  // should not be deleted by the caller.
+  virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0;
+};
 // A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
 class GrpcByteBufferSource
@@ -46,6 +144,43 @@ class GrpcByteBufferSource
  ::google::protobuf::int64 byte_count_;
 };
+class GrpcByteBufferSourceWrapper : public Source {  // Exposes a GrpcByteBufferSource through the Source interface.
+ public:
+  explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source)
+      : source_(source) {}
+  ::google::protobuf::io::ZeroCopyInputStream* contents() override {
+    return source_;  // the same wrapped stream on every call
+  }
+ private:
+  GrpcByteBufferSource* source_;  // never deleted by this class
+};
+// A Source backed by a raw grpc_byte_buffer. Each contents() call constructs
+// a fresh GrpcBufferReader over buffer_ in storage embedded in this object,
+// so no heap allocation is made for the stream.
+class GrpcByteSource : public Source {
+ public:
+  explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {}
+  ~GrpcByteSource() override { DeleteStream(); }
+  typedef ::grpc::GrpcBufferReader Reader;
+  // Destroys the stream handed out by the previous call (if any) and returns
+  // a new one. The stream lives in space_ and must not be deleted by callers.
+  ::google::protobuf::io::ZeroCopyInputStream* contents() override {
+    DeleteStream();
+    stream_ = new (&space_) Reader(buffer_);
+    return stream_;
+  }
+ private:
+  // Runs the destructor of the in-place stream. stream_ is cleared afterwards
+  // so the Reader can never be destroyed twice, e.g. if constructing the
+  // replacement Reader in contents() throws and ~GrpcByteSource() runs later.
+  void DeleteStream() {
+    if (stream_) {
+      stream_->~Reader();
+      stream_ = nullptr;
+    }
+  }
+  grpc_byte_buffer* buffer_;  // Not owned
+  Reader* stream_ = nullptr;  // Points into space_ if non-nullptr
+  // Raw storage for the placement-new'd Reader. alignas gives it the
+  // alignment Reader requires, which a plain char array does not guarantee.
+  alignas(Reader) char space_[sizeof(Reader)];
+};
 }  // namespace detail
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "grpc_client.h"
+#include <sys/time.h>
 #include "paddle/fluid/framework/threadpool.h"
 namespace paddle {
 namespace operators {
 namespace detail {
@@ -31,8 +33,9 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
  framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] {
    auto* var = p_scope->FindVar(var_name_val);
-    sendrecv::VariableMessage req;
-    SerializeToMessage(var_name_val, var, *p_ctx, &req);
+    ::grpc::ByteBuffer req;
+    SerializeToByteBuffer(var_name_val, var, *p_ctx, &req);
    // varhandle
    VarHandle var_h;
@@ -46,8 +49,10 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
    s->Prepare(var_h, time_out);
    s->response_call_back_ = NULL;
-    auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
+    auto call = s->stub_g_.PrepareUnaryCall(
-    rpc->Finish(&s->reply_, &s->status_, (void*)s);
+        s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
+    call->StartCall();
+    call->Finish(&s->reply_, &s->status_, (void*)s);
  });
  req_count_++;
@@ -56,9 +61,19 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
 }
 void ProcGetResponse(const VarHandle& var_h,
-                     const sendrecv::VariableMessage& ret_msg) {
+                     // const sendrecv::VariableMessage& ret_msg) {
-  auto* outvar = var_h.scope->FindVar(var_h.name);
+                     const ::grpc::ByteBuffer& ret_msg) {
-  DeserializeFromMessage(ret_msg, *var_h.ctx, outvar);
+  framework::Variable* outvar = NULL;
+  DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, outvar);
+}
+// Serializes a protobuf message into a single-slice ::grpc::ByteBuffer and
+// swaps it into *result, replacing whatever *result held before.
+template <typename T>
+void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
+  // ByteSizeLong() is called first: the slice is sized from it, and
+  // SerializeWithCachedSizesToArray() relies on the sizes it caches.
+  ::grpc::Slice payload(proto.ByteSizeLong());
+  const uint8_t* payload_begin =
+      reinterpret_cast<const uint8_t*>(payload.begin());
+  uint8_t* write_ptr = const_cast<uint8_t*>(payload_begin);
+  proto.SerializeWithCachedSizesToArray(write_ptr);
+  ::grpc::ByteBuffer single_slice_buffer(&payload, 1);
+  result->Swap(&single_slice_buffer);
 }
 bool RPCClient::AsyncGetVariable(const std::string& ep,
@@ -88,8 +103,13 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
    s->Prepare(var_h, time_out);
    s->response_call_back_ = ProcGetResponse;
-    auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
+    ::grpc::ByteBuffer buf;
-    rpc->Finish(&s->reply_, &s->status_, (void*)s);
+    RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
+    auto call = s->stub_g_.PrepareUnaryCall(
+        s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
+    call->StartCall();
+    call->Finish(&s->reply_, &s->status_, (void*)s);
  });
  req_count_++;

--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
@@ -25,6 +25,11 @@ limitations under the License. */
 #include <string>
 #include <vector>
+#include <grpc++/generic/generic_stub.h>
+#include <grpc++/grpc++.h>
+#include <grpc++/support/byte_buffer.h>
+#include <grpc++/support/slice.h>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
@@ -49,15 +54,11 @@ struct VarHandle {
  }
 };
-void ProcGetResponse(const VarHandle& var_h,
+void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
-                     const sendrecv::VariableMessage& msg);
 class BaseProcessor {
 public:
-  explicit BaseProcessor(std::shared_ptr<grpc::Channel> ch) {
+  explicit BaseProcessor(std::shared_ptr<grpc::Channel> ch) { context_ = NULL; }
-    stub_ = sendrecv::SendRecvService::NewStub(ch);
-    context_ = NULL;
-  }
  virtual ~BaseProcessor() {}
@@ -82,19 +83,18 @@ class BaseProcessor {
  virtual void Process() = 0;
-  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
  std::unique_ptr<grpc::ClientContext> context_;
  grpc::Status status_;
  VarHandle var_h_;
 };
-typedef std::function<void(const VarHandle&, const sendrecv::VoidMessage&)>
+typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
    RequestSendCallBack;
 class SendProcessor : public BaseProcessor {
 public:
  explicit SendProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {}
+      : BaseProcessor(ch), stub_g_(ch) {}
  virtual ~SendProcessor() {}
@@ -104,17 +104,18 @@ class SendProcessor : public BaseProcessor {
    }
  }
-  sendrecv::VoidMessage reply_;
+  ::grpc::GenericStub stub_g_;
+  ::grpc::ByteBuffer reply_;
  RequestSendCallBack response_call_back_ = NULL;
 };
-typedef std::function<void(const VarHandle&, const sendrecv::VariableMessage&)>
+typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
    RequestGetCallBack;
 class GetProcessor : public BaseProcessor {
 public:
  explicit GetProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {}
+      : BaseProcessor(ch), stub_g_(ch) {}
  virtual ~GetProcessor() {}
@@ -124,30 +125,37 @@ class GetProcessor : public BaseProcessor {
    }
  }
-  sendrecv::VariableMessage reply_;
+  ::grpc::ByteBuffer reply_;
+  ::grpc::GenericStub stub_g_;
  RequestGetCallBack response_call_back_ = ProcGetResponse;
 };
 class BatchBarrierProcessor : public BaseProcessor {
  public:
   explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {}
+      : BaseProcessor(ch), stub_(sendrecv::SendRecvService::NewStub(ch)) {}
   virtual ~BatchBarrierProcessor() {}
   virtual void Process() {}
   sendrecv::VoidMessage reply_;
+  // Typed stub built from the channel; BaseProcessor does not create one.
+  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
 class FetchBarrierProcessor : public BaseProcessor {
  public:
   explicit FetchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {}
+      : BaseProcessor(ch), stub_(sendrecv::SendRecvService::NewStub(ch)) {}
   virtual ~FetchBarrierProcessor() {}
   virtual void Process() {}
   sendrecv::VariableMessage reply_;
+  // Typed stub built from the channel; BaseProcessor does not create one.
+  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
 class RPCClient {

--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/detail/grpc_server.h"
-using grpc::ServerAsyncResponseWriter;
+using ::grpc::ServerAsyncResponseWriter;
 namespace paddle {
 namespace operators {
@@ -26,9 +26,10 @@ enum CallStatus { PROCESS = 0, FINISH };
 // https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server
 class RequestBase {
 public:
-  explicit RequestBase(sendrecv::SendRecvService::AsyncService* service,
+  explicit RequestBase(GrpcService::AsyncService* service,
-                       grpc::ServerCompletionQueue* cq)
+                       ::grpc::ServerCompletionQueue* cq,
-      : service_(service), cq_(cq), status_(PROCESS) {
+                       const platform::DeviceContext* dev_ctx)
+      : service_(service), cq_(cq), status_(PROCESS), dev_ctx_(dev_ctx) {
    PADDLE_ENFORCE(cq_);
  }
  virtual ~RequestBase() {}
@@ -42,55 +43,58 @@ class RequestBase {
  }
 protected:
-  grpc::ServerContext ctx_;
+  ::grpc::ServerContext ctx_;
-  sendrecv::SendRecvService::AsyncService* service_;
+  GrpcService::AsyncService* service_;
-  grpc::ServerCompletionQueue* cq_;
+  ::grpc::ServerCompletionQueue* cq_;
  CallStatus status_;
+  const platform::DeviceContext* dev_ctx_;
 };
-typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
 class RequestSend final : public RequestBase {  // Handles one SendVariable RPC.
  public:
-  explicit RequestSend(sendrecv::SendRecvService::AsyncService* service,
+  explicit RequestSend(GrpcService::AsyncService* service,
-                       grpc::ServerCompletionQueue* cq,
+                       ::grpc::ServerCompletionQueue* cq,
-                       SimpleBlockQueue<MessageWithName>* queue)
+                       framework::Scope* scope, ReceivedQueue* queue,
-      : RequestBase(service, cq), queue_(queue), responder_(&ctx_) {
+                       const platform::DeviceContext* dev_ctx)
-    service_->RequestSendVariable(&ctx_, &request_, &responder_, cq_, cq_,
+      : RequestBase(service, cq, dev_ctx), queue_(queue), responder_(&ctx_) {
-                                  this);
+    request_.reset(new VariableResponse(scope, dev_ctx_));  // object the incoming request is parsed into
+    int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable);
+    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
+                                cq_, cq_, this);  // `this` is passed as the completion-queue tag
   }
   virtual ~RequestSend() {}
-  virtual std::string GetReqName() { return request_.varname(); }
+  virtual std::string GetReqName() { return request_->Varname(); }
   virtual void Process() {
-    MessageWithName msg_with_name =
+    queue_->Push(std::make_pair(request_->Varname(), request_));  // queue entry shares ownership of request_
-        std::make_pair(request_.varname(), std::move(request_));
-    queue_->Push(std::move(msg_with_name));
+    sendrecv::VoidMessage reply;
-    responder_.Finish(reply_, grpc::Status::OK, this);
+    responder_.Finish(reply, ::grpc::Status::OK, this);  // answer with an empty VoidMessage
     status_ = FINISH;
   }
  protected:
-  sendrecv::VariableMessage request_;
+  std::shared_ptr<VariableResponse> request_;
-  sendrecv::VoidMessage reply_;
+  ReceivedQueue* queue_;
-  SimpleBlockQueue<MessageWithName>* queue_;
   ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
 };
 class RequestGet final : public RequestBase {
 public:
-  explicit RequestGet(sendrecv::SendRecvService::AsyncService* service,
+  explicit RequestGet(GrpcService::AsyncService* service,
-                      grpc::ServerCompletionQueue* cq, framework::Scope* scope,
+                      ::grpc::ServerCompletionQueue* cq,
+                      framework::Scope* scope,
                      const platform::DeviceContext* dev_ctx,
                      SimpleBlockQueue<MessageWithName>* queue)
-      : RequestBase(service, cq),
+      : RequestBase(service, cq, dev_ctx),
        responder_(&ctx_),
        scope_(scope),
-        dev_ctx_(dev_ctx),
        queue_(queue) {
-    service_->RequestGetVariable(&ctx_, &request_, &responder_, cq_, cq_, this);
+    int method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
+    service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
+                                cq_, this);
  }
  virtual ~RequestGet() {}
@@ -101,24 +105,26 @@ class RequestGet final : public RequestBase {
    // proc request.
    std::string var_name = request_.varname();
    auto* var = scope_->FindVar(var_name);
+    ::grpc::ByteBuffer reply;
    if (var_name != FETCH_BARRIER_MESSAGE) {
-      SerializeToMessage(var_name, var, *dev_ctx_, &reply_);
+      SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
    }
-    // TODO(gongwb): check var's info.
-    responder_.Finish(reply_, grpc::Status::OK, this);
+    responder_.Finish(reply, ::grpc::Status::OK, this);
    status_ = FINISH;
-    MessageWithName msg_with_name =
-        //          request name    reply
+    if (var_name == FETCH_BARRIER_MESSAGE) {
-        std::make_pair(var_name, std::move(reply_));
+      sendrecv::VariableMessage msg;
-    queue_->Push(msg_with_name);
+      MessageWithName msg_with_name = std::make_pair(var_name, msg);
+      queue_->Push(msg_with_name);
+    }
  }
 protected:
  sendrecv::VariableMessage request_;
-  sendrecv::VariableMessage reply_;
+  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
-  ServerAsyncResponseWriter<sendrecv::VariableMessage> responder_;
  framework::Scope* scope_;
-  const platform::DeviceContext* dev_ctx_;
  SimpleBlockQueue<MessageWithName>* queue_;
 };
@@ -133,8 +139,8 @@ void AsyncGRPCServer::WaitClientGet(int count) {
 }
 void AsyncGRPCServer::RunSyncUpdate() {
-  grpc::ServerBuilder builder;
+  ::grpc::ServerBuilder builder;
-  builder.AddListeningPort(address_, grpc::InsecureServerCredentials());
+  builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials());
  builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
  builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
  builder.RegisterService(&service_);
@@ -182,8 +188,8 @@ void AsyncGRPCServer::TryToRegisterNewSendOne() {
  if (is_shut_down_) {
    return;
  }
-  RequestSend* send =
+  RequestSend* send = new RequestSend(&service_, cq_send_.get(), scope_,
-      new RequestSend(&service_, cq_send_.get(), &var_recv_queue_);
+                                      &var_recv_queue_, dev_ctx_);
  VLOG(4) << "Create RequestSend status:" << send->Status();
 }
@@ -198,7 +204,7 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() {
 }
 // FIXME(typhoonzero): change cq_name to enum.
-void AsyncGRPCServer::HandleRequest(grpc::ServerCompletionQueue* cq,
+void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
                                    std::string cq_name,
                                    std::function<void()> TryToRegisterNewOne) {
  TryToRegisterNewOne();

--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
@@ -14,28 +14,31 @@ limitations under the License. */
 #pragma once
+#include <grpc++/grpc++.h>
+#include <thread>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/detail/simple_block_queue.h"
+#include "paddle/fluid/operators/detail/grpc_service.h"
 #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
 #include "paddle/fluid/operators/detail/send_recv.pb.h"
-#include <grpc++/grpc++.h>
-#include <grpc/support/log.h>
-#include <thread>
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/operators/detail/simple_block_queue.h"
 namespace paddle {
 namespace operators {
 namespace detail {
+typedef std::pair<std::string, std::shared_ptr<VariableResponse>>
+    ReceivedMessage;
+typedef SimpleBlockQueue<ReceivedMessage> ReceivedQueue;
 typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
 class RequestBase;
-class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
+class AsyncGRPCServer final {
 public:
  explicit AsyncGRPCServer(const std::string &address) : address_(address) {}
@@ -50,14 +53,16 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
  void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; }
-  const MessageWithName Get() { return this->var_recv_queue_.Pop(); }
+  const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
-  void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); }
+  void Push(const std::string &msg_name) {
+    this->var_recv_queue_.Push(std::make_pair(msg_name, nullptr));
+  }
  void ShutDown();
 protected:
-  void HandleRequest(grpc::ServerCompletionQueue *cq, std::string cq_name,
+  void HandleRequest(::grpc::ServerCompletionQueue *cq, std::string cq_name,
                     std::function<void()> TryToRegisterNewOne);
  void TryToRegisterNewSendOne();
  void TryToRegisterNewGetOne();
@@ -66,18 +71,19 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
 private:
  std::mutex cq_mutex_;
  volatile bool is_shut_down_ = false;
-  std::unique_ptr<grpc::ServerCompletionQueue> cq_send_;
+  std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_;
-  std::unique_ptr<grpc::ServerCompletionQueue> cq_get_;
+  std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_;
-  sendrecv::SendRecvService::AsyncService service_;
+  GrpcService::AsyncService service_;
-  std::unique_ptr<grpc::Server> server_;
+  std::unique_ptr<::grpc::Server> server_;
  std::string address_;
  framework::Scope *scope_;
  const platform::DeviceContext *dev_ctx_;
  // received variable from RPC, operators fetch variable from this queue.
-  SimpleBlockQueue<MessageWithName> var_recv_queue_;
  SimpleBlockQueue<MessageWithName> var_get_queue_;
+  ReceivedQueue var_recv_queue_;
  // condition of the sub program
  std::mutex barrier_mutex_;

--- a/paddle/fluid/operators/detail/grpc_service.h
+++ b/paddle/fluid/operators/detail/grpc_service.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <grpc++/impl/codegen/async_stream.h>
+#include <grpc++/impl/codegen/async_unary_call.h>
+#include <grpc++/impl/codegen/proto_utils.h>
+#include <grpc++/impl/codegen/rpc_method.h>
+#include <grpc++/impl/codegen/service_type.h>
+#include <grpc++/impl/codegen/status.h>
+#include <grpc++/impl/codegen/stub_options.h>
+#include <grpc++/impl/codegen/sync_stream.h>
+#include <grpc++/support/byte_buffer.h>
+#include "paddle/fluid/operators/detail/variable_response.h"
+// NOTE: This approach was originally created by TensorFlow
+//       (https://github.com/tensorflow/tensorflow/). We borrowed it
+//       and made some modifications so that we can parse gRPC
+//       requests without too much copying of the tensor data.
+namespace grpc {
+class CompletionQueue;
+class Channel;
+class RpcService;
+class ServerCompletionQueue;
+class ServerContext;
+// Lets gRPC deserialize an incoming request directly into a
+// paddle::operators::detail::VariableResponse. Only parsing is supported;
+// Serialize() always fails.
+template <>
+class SerializationTraits<paddle::operators::detail::VariableResponse> {
+ public:
+  // Not supported: always trips PADDLE_ENFORCE.
+  static Status Serialize(
+      const paddle::operators::detail::VariableResponse& msg,
+      grpc_byte_buffer** bp, bool* own_buffer) {
+    PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!");
+    return Status();
+  }
+  // Parses `buffer` into `msg` and then destroys `buffer`. Returns an
+  // INTERNAL status when there is no payload or when msg->Parse() reports a
+  // non-zero result. max_message_size is not consulted.
+  static Status Deserialize(grpc_byte_buffer* buffer,
+                            paddle::operators::detail::VariableResponse* msg,
+                            int max_message_size = INT_MAX) {
+    if (!buffer) {
+      return Status(StatusCode::INTERNAL, "No payload");
+    }
+    Status status = g_core_codegen_interface->ok();
+    if (status.ok()) {
+      typedef paddle::operators::detail::GrpcByteSource ByteSource;
+      ByteSource source(buffer);
+      const bool parsed = (msg->Parse(&source) == 0);
+      if (!parsed) {
+        status = Status(StatusCode::INTERNAL, "VariableResponse parse error");
+      }
+    }
+    // `source` has gone out of scope by now, so nothing reads buffer anymore.
+    g_core_codegen_interface->grpc_byte_buffer_destroy(buffer);
+    return status;
+  }
+};
+}  // namespace grpc
+namespace paddle {
+namespace operators {
+namespace detail {
+enum class GrpcMethod {  // RPC methods of GrpcService; the integer values are used as method indices
+  kSendVariable,
+  kGetVariable,
+};
+static const int kGrpcNumMethods =  // number of GrpcMethod enumerators
+    static_cast<int>(GrpcMethod::kGetVariable) + 1;  // relies on kGetVariable being the last enumerator
+// Maps a GrpcMethod to its fully-qualified gRPC method path. An id that
+// matches no enumerator trips PADDLE_ENFORCE.
+inline const char* GrpcMethodName(GrpcMethod id) {
+  const char* method_path = nullptr;
+  switch (id) {
+    case GrpcMethod::kSendVariable:
+      method_path = "/sendrecv.SendRecvService/SendVariable";
+      break;
+    case GrpcMethod::kGetVariable:
+      method_path = "/sendrecv.SendRecvService/GetVariable";
+      break;
+  }
+  if (method_path == nullptr) {
+    // Shouldn't be reached.
+    PADDLE_ENFORCE(false, "Invalid id: not found valid method name");
+  }
+  return method_path;
+}
+// Hand-written gRPC service definition. AsyncService registers one async
+// NORMAL_RPC method per GrpcMethod enumerator, in enumerator order.
+class GrpcService final {
+ public:
+  class AsyncService : public ::grpc::Service {
+   public:
+    AsyncService() {
+      for (int idx = 0; idx < kGrpcNumMethods; ++idx) {
+        const char* method_path = GrpcMethodName(static_cast<GrpcMethod>(idx));
+        ::grpc::internal::RpcServiceMethod* descriptor =
+            new ::grpc::internal::RpcServiceMethod(
+                method_path, ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr);
+        AddMethod(descriptor);
+        ::grpc::Service::MarkMethodAsync(idx);
+      }
+    }
+    virtual ~AsyncService() {}
+    // Re-export RequestAsyncUnary as public so that request handlers outside
+    // this class can call it.
+    using ::grpc::Service::RequestAsyncUnary;
+  };
+};
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/detail/send_recv.proto
+++ b/paddle/fluid/operators/detail/send_recv.proto
@@ -32,6 +32,9 @@ enum VarType {
  SELECTED_ROWS = 1;
 }
+// NOTICE(gongwb): don't modify this proto if you are not
+//   familiar with how we serialize it in sendrecvop_utils.h
+//   and deserialize it in variable_response.h.
 message VariableMessage {
  enum Type {
    // Pod Types
@@ -45,7 +48,6 @@ message VariableMessage {
  }
  message LodData { repeated int64 lod_data = 1; }
  string varname = 1;
  // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
  VarType type = 2;
@@ -64,3 +66,5 @@ message VariableMessage {
 }
 message VoidMessage {}
+message TestMessage { int64 test_1 = 1; }
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
@@ -13,61 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include <sys/time.h>
+#include <thread>
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/operators/detail/bytebuffer_stream.h"
 #include "paddle/fluid/operators/detail/proto_encoder_helper.h"
+#include "paddle/fluid/operators/detail/variable_response.h"
 namespace paddle {
 namespace operators {
 namespace detail {
-void SerializeToMessage(const std::string& name, const framework::Variable* var,
-                        const platform::DeviceContext& ctx,
-                        sendrecv::VariableMessage* msg) {
-  msg->set_varname(name);
-  std::ostringstream oss;
-  switch (framework::ToVarType(var->Type())) {
-    case framework::proto::VarType_Type_LOD_TENSOR:
-      msg->set_type(sendrecv::VarType::LOD_TENSOR);
-      framework::SerializeToStream(oss, var->Get<framework::LoDTensor>(), ctx);
-      break;
-    case framework::proto::VarType_Type_SELECTED_ROWS:
-      msg->set_type(sendrecv::VarType::SELECTED_ROWS);
-      framework::SerializeToStream(oss, var->Get<framework::SelectedRows>(),
-                                   ctx);
-      break;
-    default: {
-      PADDLE_THROW("Serialize does not support type: %s",
-                   typeid(var->Type()).name());
-      break;
-    }
-  }
-  msg->set_serialized(oss.str());
-}
-void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
-                            const platform::DeviceContext& ctx,
-                            framework::Variable* var) {
-  std::istringstream iss(msg.serialized());
-  switch (msg.type()) {
-    case sendrecv::VarType::LOD_TENSOR:
-      DeserializeFromStream(iss, var->GetMutable<framework::LoDTensor>(), ctx);
-      break;
-    case sendrecv::VarType::SELECTED_ROWS: {
-      DeserializeFromStream(iss, var->GetMutable<framework::SelectedRows>(),
-                            ctx);
-      break;
-    }
-    default: {
-      PADDLE_THROW("Deserialize does not support type: %s",
-                   typeid(var->Type()).name());
-      break;
-    }
-  }
-}
 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                           const platform::DeviceContext& ctx,
                           ::grpc::ByteBuffer* msg) {
@@ -123,6 +81,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
            static_cast<const platform::CUDADeviceContext&>(ctx);
        auto copy_size = tensor.memory_size();
        payload = memory::Alloc(cpu, copy_size);
        memory::Copy(cpu, payload,
                     boost::get<platform::CUDAPlace>(tensor.place()),
                     reinterpret_cast<const void*>(tensor.data<void>()),
@@ -132,6 +91,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
          platform::CPUPlace cpu;
          memory::Free(cpu, backing);
        };
 #endif
      } else {
        payload = tensor.data<void>();
@@ -219,80 +179,11 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
 void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                               const platform::DeviceContext& ctx,
-                               framework::Variable* var) {
+                               const framework::Scope* scope,
-  sendrecv::VariableMessage meta;
+                               framework::Variable*& var) {
-  GrpcByteBufferSource source;
+  operators::detail::VariableResponse resp(scope, &ctx);
-  source.Init(msg);
+  PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
-  ::google::protobuf::io::CodedInputStream input(&source);
+  var = resp.GetVar();
-  // do zerocopy parsing
-  PADDLE_ENFORCE(meta.ParseFromCodedStream(&input));
-  PADDLE_ENFORCE(input.ConsumedEntireMessage());
-  // dims is needed by both tensor and selectedrows
-  std::vector<int> vecdims;
-  for (auto& d : meta.dims()) {
-    vecdims.push_back(d);
-  }
-  framework::DDim dims = framework::make_ddim(vecdims);
-  if (meta.type() == sendrecv::LOD_TENSOR) {
-    auto* tensor = var->GetMutable<framework::LoDTensor>();
-    tensor->Resize(dims);
-    void* tensor_data = tensor->mutable_data(
-        ctx.GetPlace(),
-        paddle::operators::detail::ToTypeIndex(meta.data_type()));
-    framework::LoD lod;
-    for (int i = 0; i < meta.lod_level(); ++i) {
-      framework::Vector<size_t> v;
-      for (int j = 0; j < meta.lod(i).lod_data_size(); ++j) {
-        v.push_back(meta.lod(i).lod_data(j));
-      }
-      lod.push_back(v);
-    }
-    tensor->set_lod(lod);
-    // How to avoid copying and use the message buffer directly?
-    // Maybe need to find a way to release all memory except tensor content.
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-      platform::CPUPlace cpu;
-      auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
-      memory::Copy(boost::get<platform::CUDAPlace>(tensor->place()),
-                   tensor_data, cpu,
-                   reinterpret_cast<const void*>(meta.serialized().data()),
-                   meta.serialized().size(), gpu_dev_ctx.stream());
-      ctx.Wait();
-#endif
-    } else {
-      memcpy(tensor_data,
-             reinterpret_cast<const void*>(meta.serialized().data()),
-             meta.serialized().size());
-    }
-  } else if (meta.type() == sendrecv::SELECTED_ROWS) {
-    auto* slr = var->GetMutable<framework::SelectedRows>();
-    auto* tensor = slr->mutable_value();
-    int64_t* rows_data = slr->mutable_rows()->data();
-    tensor->Resize(dims);
-    void* tensor_data = tensor->mutable_data(
-        ctx.GetPlace(),
-        paddle::operators::detail::ToTypeIndex(meta.data_type()));
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-      platform::CPUPlace cpu;
-      auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
-      memory::Copy(boost::get<platform::CUDAPlace>(tensor->place()),
-                   tensor_data, cpu,
-                   reinterpret_cast<const void*>(meta.serialized().data()),
-                   meta.serialized().size(), gpu_dev_ctx.stream());
-      ctx.Wait();
-#endif
-    } else {
-      memcpy(tensor_data,
-             reinterpret_cast<const void*>(meta.serialized().data()),
-             meta.serialized().size());
-    }
-    // copy rows CPU data, GPU data will be copied lazly
-    memcpy(rows_data, reinterpret_cast<const void*>(meta.rows().data()),
-           meta.rows().size());
-  }
 }
 }  // namespace detail

--- a/paddle/fluid/operators/detail/sendrecvop_utils.h
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
@@ -36,21 +37,14 @@ namespace detail {
 typedef void (*DestroyCallback)(void*);
-void SerializeToMessage(const std::string& name, const framework::Variable* var,
-                        const platform::DeviceContext& ctx,
-                        sendrecv::VariableMessage* msg);
-void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
-                            const platform::DeviceContext& ctx,
-                            framework::Variable* var);
 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                           const platform::DeviceContext& ctx,
                           ::grpc::ByteBuffer* msg);
 void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                               const platform::DeviceContext& ctx,
-                               framework::Variable* var);
+                               const framework::Scope* scope,
+                               framework::Variable*& var);
 inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
  switch (type) {

--- a/paddle/fluid/operators/detail/test_serde.cc
+++ b/paddle/fluid/operators/detail/test_serde.cc
@@ -16,11 +16,13 @@ limitations under the License. */
 #include <string>
 #include <thread>
+#include <google/protobuf/text_format.h>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/operators/detail/variable_response.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
@@ -31,19 +33,21 @@ namespace operators = paddle::operators;
 namespace math = paddle::operators::math;
 namespace memory = paddle::memory;
-void RunSerdeTestTensor(platform::Place place) {
+void RunSerdeTestSelectedRows(platform::Place place) {
-  // serialize var to ByteBuffer
-  framework::Variable var;
-  auto* tensor = var.GetMutable<framework::LoDTensor>();
-  tensor->Resize(framework::make_ddim({4, 8, 4, 2}));
-  framework::LoD lod;
-  lod.push_back(framework::Vector<size_t>({1, 3, 8}));
-  tensor->set_lod(lod);
-  int tensor_numel = 4 * 8 * 4 * 2;
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto& ctx = *pool.Get(place);
+  // serialize var to ByteBuffer
+  framework::Variable var;
+  auto* slr = var.GetMutable<framework::SelectedRows>();
+  auto* tensor = slr->mutable_value();
+  auto* rows = slr->mutable_rows();
+  tensor->Resize(framework::make_ddim({2, 10}));
  tensor->mutable_data<float>(place);
-  math::set_constant(ctx, tensor, 31.9);
+  int tensor_numel = 2 * 10;
+  math::set_constant(ctx, tensor, 32.7);
+  rows->push_back(3);
+  rows->push_back(10);
  ::grpc::ByteBuffer msg;
  operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
@@ -56,62 +60,67 @@ void RunSerdeTestTensor(platform::Place place) {
  for (const auto& s : slices) {
    tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
  }
  sendrecv::VariableMessage varmsg;
  EXPECT_TRUE(varmsg.ParseFromString(tmp));
  EXPECT_EQ(varmsg.varname(), "myvar");
-  EXPECT_EQ(varmsg.type(), 0);
+  EXPECT_EQ(varmsg.type(), 1);
-  EXPECT_EQ(varmsg.dims()[0], 4);
-  EXPECT_EQ(varmsg.dims()[1], 8);
-  EXPECT_EQ(varmsg.dims()[2], 4);
-  EXPECT_EQ(varmsg.dims()[3], 2);
-  EXPECT_EQ(varmsg.lod_level(), 1);
-  EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
-  EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
-  EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
  const float* tensor_data =
      reinterpret_cast<const float*>(varmsg.serialized().data());
+  const int64_t* rows_data =
+      reinterpret_cast<const int64_t*>(varmsg.rows().data());
  for (int i = 0; i < tensor_numel; ++i) {
-    EXPECT_FLOAT_EQ(tensor_data[i], 31.9);
+    EXPECT_FLOAT_EQ(tensor_data[i], 32.7);
  }
+  EXPECT_EQ(rows_data[0], 3);
+  EXPECT_EQ(rows_data[1], 10);
  // deserialize zero-copy
-  framework::Variable var2;
+  // framework::Variable var2;
-  operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
+  // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
-  auto tensor2 = var2.Get<framework::LoDTensor>();
+  framework::Scope scope;
+  scope.Var("myvar");
+  operators::detail::VariableResponse resp(&scope, &ctx);
+  EXPECT_EQ(resp.Parse(msg), 0);
+  framework::Variable* var2 = resp.GetVar();
+  auto* slr2 = var2->GetMutable<framework::SelectedRows>();
+  auto* tensor2 = slr2->mutable_value();
+  auto* rows2 = slr2->mutable_rows();
  float* tensor_data2 = nullptr;
  framework::Tensor tmp_tensor;
  if (platform::is_gpu_place(ctx.GetPlace())) {
    platform::CPUPlace cpu;
-    framework::TensorCopy(tensor2, cpu, &tmp_tensor);
+    framework::TensorCopy(*tensor2, cpu, &tmp_tensor);
    tensor_data2 = tmp_tensor.data<float>();
  } else {
-    tensor_data2 = const_cast<float*>(tensor2.data<float>());
+    tensor_data2 = const_cast<float*>(tensor2->data<float>());
  }
+  const int64_t* rows_data2 = rows2->data();
-  EXPECT_EQ(varmsg.lod_level(), 1);
+  for (int i = 0; i < tensor_numel; ++i) {
-  EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
+    EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
-  EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
+  }
-  EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
+  EXPECT_EQ(rows_data2[0], 3);
-  for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
+  EXPECT_EQ(rows_data2[1], 10);
 }
-void RunSerdeTestSelectedRows(platform::Place place) {
+void RunTestLodTensor(platform::Place place, int from_type = 0) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& ctx = *pool.Get(place);
  // serialize var to ByteBuffer
  framework::Variable var;
-  auto* slr = var.GetMutable<framework::SelectedRows>();
+  auto* tensor = var.GetMutable<framework::LoDTensor>();
-  auto* tensor = slr->mutable_value();
+  tensor->Resize(framework::make_ddim({4, 8, 4, 2}));
-  auto* rows = slr->mutable_rows();
+  framework::LoD lod;
-  tensor->Resize(framework::make_ddim({2, 10}));
+  lod.push_back(framework::Vector<size_t>({1, 3, 8}));
+  tensor->set_lod(lod);
+  int tensor_numel = 4 * 8 * 4 * 2;
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& ctx = *pool.Get(place);
  tensor->mutable_data<float>(place);
-  int tensor_numel = 2 * 10;
+  math::set_constant(ctx, tensor, 31.9);
-  math::set_constant(ctx, tensor, 32.7);
-  rows->push_back(3);
-  rows->push_back(10);
  ::grpc::ByteBuffer msg;
  operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
@@ -126,61 +135,82 @@ void RunSerdeTestSelectedRows(platform::Place place) {
  }
  sendrecv::VariableMessage varmsg;
  EXPECT_TRUE(varmsg.ParseFromString(tmp));
  EXPECT_EQ(varmsg.varname(), "myvar");
-  EXPECT_EQ(varmsg.type(), 1);
+  EXPECT_EQ(varmsg.type(), 0);
+  EXPECT_EQ(varmsg.dims()[0], 4);
+  EXPECT_EQ(varmsg.dims()[1], 8);
+  EXPECT_EQ(varmsg.dims()[2], 4);
+  EXPECT_EQ(varmsg.dims()[3], 2);
+  EXPECT_EQ(varmsg.lod_level(), 1);
+  EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
+  EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
+  EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
  const float* tensor_data =
      reinterpret_cast<const float*>(varmsg.serialized().data());
-  const int64_t* rows_data =
-      reinterpret_cast<const int64_t*>(varmsg.rows().data());
  for (int i = 0; i < tensor_numel; ++i) {
-    EXPECT_FLOAT_EQ(tensor_data[i], 32.7);
+    EXPECT_FLOAT_EQ(tensor_data[i], 31.9);
  }
-  EXPECT_EQ(rows_data[0], 3);
-  EXPECT_EQ(rows_data[1], 10);
+  // message binary
+  std::string str;
+  varmsg.SerializeToString(&str);
+  // message bytebuffer
+  ::grpc::Slice slices_2[1];
+  int num_slices = 1;
+  slices_2[0] = ::grpc::Slice(str.length());
+  memcpy(const_cast<uint8_t*>(slices_2[0].begin()), str.c_str(), str.length());
+  ::grpc::ByteBuffer bytebuffer2(&slices_2[0], num_slices);
  // deserialize zero-copy
-  framework::Variable var2;
+  framework::Scope scope;
-  operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
+  scope.Var("myvar");
+  operators::detail::VariableResponse resp(&scope, &ctx);
+  if (from_type == 0) {
+    EXPECT_EQ(resp.Parse(msg), 0);
+  } else {
+    EXPECT_EQ(resp.Parse(bytebuffer2), 0);
+  }
-  auto* slr2 = var2.GetMutable<framework::SelectedRows>();
+  framework::Variable* var2 = resp.GetVar();
-  auto* tensor2 = slr2->mutable_value();
-  auto* rows2 = slr2->mutable_rows();
+  auto tensor2 = var2->Get<framework::LoDTensor>();
  float* tensor_data2 = nullptr;
  framework::Tensor tmp_tensor;
  if (platform::is_gpu_place(ctx.GetPlace())) {
    platform::CPUPlace cpu;
-    framework::TensorCopy(*tensor2, cpu, &tmp_tensor);
+    framework::TensorCopy(tensor2, cpu, &tmp_tensor);
    tensor_data2 = tmp_tensor.data<float>();
  } else {
-    tensor_data2 = const_cast<float*>(tensor2->data<float>());
+    tensor_data2 = const_cast<float*>(tensor2.data<float>());
  }
-  const int64_t* rows_data2 = rows2->data();
-  for (int i = 0; i < tensor_numel; ++i) {
+  EXPECT_EQ(varmsg.lod_level(), 1);
-    EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
+  EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
-  }
+  EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
-  EXPECT_EQ(rows_data2[0], 3);
+  EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
-  EXPECT_EQ(rows_data2[1], 10);
+  for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
 }
-TEST(SelectedRows, CPU) {
+TEST(LodTensor, Run) {
  platform::CPUPlace place;
-  RunSerdeTestSelectedRows(place);
+  RunTestLodTensor(place);
+  RunTestLodTensor(place, 1);
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDAPlace gpu(0);
+  RunTestLodTensor(gpu);
+  RunTestLodTensor(gpu, 1);
+#endif
 }
-TEST(SelectedRows, GPU) {
+TEST(SelectedRows, Run) {
-  platform::CUDAPlace place;
+  platform::CPUPlace place;
  RunSerdeTestSelectedRows(place);
-}
-TEST(Tensor, CPU) {
+#ifdef PADDLE_WITH_CUDA
-  platform::CPUPlace place;
+  platform::CUDAPlace gpu;
-  RunSerdeTestTensor(place);
+  RunSerdeTestSelectedRows(gpu);
+#endif
 }
-TEST(Tensor, GPU) {
-  platform::CUDAPlace place;
-  RunSerdeTestTensor(place);
-}
\ No newline at end of file
--- a/paddle/fluid/operators/detail/variable_response.cc
+++ b/paddle/fluid/operators/detail/variable_response.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/detail/variable_response.h"
+#include <string.h>
+#include "paddle/fluid/operators/detail/send_recv.pb.h"
+#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+namespace paddle {
+namespace operators {
+namespace detail {
+// Subset of protobuf wire types this hand-written parser understands. The
+// numeric values are what GetTagWireType() extracts from the low three bits
+// of a field tag.
+enum WireType {
+  WIRETYPE_VARINT = 0,            // single varint-encoded value
+  WIRETYPE_LENGTH_DELIMITED = 2,  // varint length followed by that many bytes
+};
+// Returns the field number carried by a protobuf tag, i.e. the tag with its
+// low three (wire-type) bits shifted away.
+inline int GetTagFieldNumber(uint32_t tag) {
+  const uint32_t field_number = tag >> 3;
+  return field_number;
+}
+// Returns the wire type stored in the low three bits of a protobuf tag.
+inline WireType GetTagWireType(uint32_t tag) {
+  const uint32_t wire_bits = tag & 0x7;
+  return static_cast<WireType>(wire_bits);
+}
+// Reads one varint from `input` and stores it in `*result` as an int.
+// Returns false, leaving `*result` untouched, when the read fails or the
+// decoded value is larger than INT_MAX.
+bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input,
+                         int* result) {
+  uint64_t raw;
+  if (!input->ReadVarint64(&raw)) {
+    return false;
+  }
+  if (raw > static_cast<uint64_t>(INT_MAX)) {
+    return false;
+  }
+  *result = static_cast<int>(raw);
+  return true;
+}
+// Copies exactly `size` bytes from `input` into `dest`, which lives on `place`.
+//
+// The stream is drained chunk by chunk through GetDirectBufferPointer(). A
+// chunk may extend past the end of the field being read, so every chunk is
+// clamped to the number of bytes still wanted; otherwise the copy would run
+// past the end of `dest` and swallow bytes belonging to the following fields.
+//
+// For a GPU `place`, `dev_ctx` is cast to a CUDADeviceContext, the copies are
+// issued on its stream, and the context is waited on before returning. Built
+// without PADDLE_WITH_CUDA, a GPU `place` throws.
+//
+// Returns false if the stream runs out of data (or cannot be advanced) before
+// `size` bytes have been copied; true otherwise. A non-positive `size` copies
+// nothing and returns true.
+bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
+             const platform::DeviceContext& dev_ctx, platform::Place place,
+             void* dest, int size) {
+  const void* data = NULL;
+  int size_to_write = 0;
+  if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_WITH_CUDA
+    auto& gpu_dev_ctx =
+        static_cast<const platform::CUDADeviceContext&>(dev_ctx);
+    platform::CPUPlace cpu;
+    char* p = reinterpret_cast<char*>(dest);
+    while (size > 0) {
+      if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
+        return false;
+      }
+      // Never take more than what is still missing from this field.
+      if (size_to_write > size) {
+        size_to_write = size;
+      }
+      memory::Copy(boost::get<platform::CUDAPlace>(place),
+                   reinterpret_cast<void*>(p), cpu, data, size_to_write,
+                   gpu_dev_ctx.stream());
+      p += size_to_write;
+      size -= size_to_write;
+      if (!input->Skip(size_to_write)) {
+        return false;
+      }
+    }
+    gpu_dev_ctx.Wait();
+#else
+    PADDLE_THROW("Unexpected branch");
+#endif
+    return true;
+  }
+  char* p = reinterpret_cast<char*>(dest);
+  while (size > 0) {
+    if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
+      return false;
+    }
+    // Never take more than what is still missing from this field.
+    if (size_to_write > size) {
+      size_to_write = size;
+    }
+    // TODO(gongwb): can we avoid copy?
+    platform::CPUPlace cpu;
+    memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);
+    p += size_to_write;
+    size -= size_to_write;
+    if (!input->Skip(size_to_write)) {
+      return false;
+    }
+  }
+  return true;
+}
+// Fills the LoDTensor held by the variable named meta_.varname() in scope_:
+// resizes it to `dims`, rebuilds its LoD from the lod entries already parsed
+// into meta_, allocates storage on ctx.GetPlace() with the element type given
+// by meta_.data_type(), and reads `length` payload bytes from `input` into it.
+// Returns the result of that read.
+bool VariableResponse::CopyLodTensorData(
+    ::google::protobuf::io::CodedInputStream* input,
+    const platform::DeviceContext& ctx, framework::DDim& dims, int length) {
+  auto* tensor =
+      scope_->FindVar(meta_.varname())->GetMutable<framework::LoDTensor>();
+  tensor->Resize(dims);
+
+  // Rebuild the LoD level by level from the already-parsed metadata.
+  framework::LoD lod;
+  for (int level = 0; level < meta_.lod_level(); ++level) {
+    const auto& src_level = meta_.lod(level);
+    framework::Vector<size_t> offsets;
+    for (int k = 0; k < src_level.lod_data_size(); ++k) {
+      offsets.push_back(src_level.lod_data(k));
+    }
+    lod.push_back(offsets);
+  }
+  tensor->set_lod(lod);
+
+  void* buffer =
+      tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type()));
+  return ReadRaw(input, ctx, tensor->place(), buffer, length);
+}
+// Converts the repeated int64 `dims` field of a message into a DDim. Each
+// entry is narrowed to int before being handed to make_ddim.
+inline framework::DDim GetDims(
+    const ::google::protobuf::RepeatedField<::google::protobuf::int64>& dims) {
+  std::vector<int> shape;
+  for (int i = 0; i < dims.size(); ++i) {
+    shape.push_back(dims.Get(i));
+  }
+  return framework::make_ddim(shape);
+}
+// Fills the value tensor of the SelectedRows held by the variable named
+// meta_.varname() in scope_: resizes it to `dims`, allocates storage on
+// ctx.GetPlace() with the element type given by meta_.data_type(), and reads
+// `length` payload bytes from `input` into it. Returns the result of that
+// read.
+bool VariableResponse::CopySelectRowsTensorData(
+    ::google::protobuf::io::CodedInputStream* input,
+    const platform::DeviceContext& ctx, framework::DDim& dims, int length) {
+  auto* selected_rows =
+      scope_->FindVar(meta_.varname())->GetMutable<framework::SelectedRows>();
+  auto* value = selected_rows->mutable_value();
+  value->Resize(dims);
+  const auto element_type =
+      paddle::operators::detail::ToTypeIndex(meta_.data_type());
+  void* buffer = value->mutable_data(ctx.GetPlace(), element_type);
+  return ReadRaw(input, ctx, value->place(), buffer, length);
+}
+// Reads the `rows` payload (`length` bytes of raw int64_t row ids) from
+// `input` into the rows vector of the SelectedRows held by the variable named
+// meta_.varname() in scope_.
+//
+// The rows vector is sized to hold the incoming ids before its storage is
+// written; without that the copy would land in whatever storage the vector
+// happened to own. A `length` that is negative or not a whole number of
+// int64_t values is rejected.
+//
+// Returns false on a bad `length` or when the payload cannot be read.
+bool VariableResponse::CopySelectRowsData(
+    ::google::protobuf::io::CodedInputStream* input,
+    const platform::DeviceContext& ctx, int length) {
+  const int kRowBytes = static_cast<int>(sizeof(int64_t));
+  if (length < 0 || length % kRowBytes != 0) {
+    return false;
+  }
+  auto var = scope_->FindVar(meta_.varname());
+  auto* slr = var->GetMutable<framework::SelectedRows>();
+  auto* rows = slr->mutable_rows();
+  // Make room for every incoming row id before taking the raw pointer.
+  rows->resize(length / kRowBytes);
+  int64_t* rows_data = rows->data();
+  // copy rows CPU data, GPU data will be copied lazily.
+  platform::CPUPlace cpu;
+  if (!ReadRaw(input, ctx, cpu, rows_data, length)) {
+    return false;
+  }
+  return true;
+}
+// Parses the body of one VariableMessage.LodData sub-message from `input`,
+// appending every lod_data value to `*lod`. Both the unpacked (one varint per
+// tag) and packed (length-delimited run of varints) encodings are accepted.
+//
+// The caller is expected to have pushed a limit covering the sub-message, so
+// that the tag read at its end yields field number 0.
+//
+// Returns true when the sub-message ends cleanly; false on a failed read, an
+// unexpected wire type, or any field other than lod_data.
+bool ParseLodData(::google::protobuf::io::CodedInputStream* input,
+                  std::vector<int64_t>* lod) {
+  while (true) {
+    auto p = input->ReadTagWithCutoff(127);
+    int tag = GetTagFieldNumber(p.first);
+    WireType wt = GetTagWireType(p.first);
+    if (!p.second) {
+      return (tag == 0);
+    }
+    switch (tag) {
+      case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: {
+        uint64_t v;
+        if (wt == WIRETYPE_VARINT) {
+          if (!input->ReadVarint64(&v)) {
+            return false;
+          }
+          lod->push_back(v);
+          break;
+        }
+        if (wt == WIRETYPE_LENGTH_DELIMITED) {
+          int length = 0;
+          if (!input->ReadVarintSizeAsInt(&length)) {
+            // This function returns bool: a failed read must report failure.
+            return false;
+          }
+          // `length` is the packed payload size in bytes, not an element
+          // count, so read varints until that many bytes are consumed.
+          auto packed_limit = input->PushLimit(length);
+          while (input->BytesUntilLimit() > 0) {
+            if (!input->ReadVarint64(&v)) {
+              return false;
+            }
+            lod->push_back(v);
+          }
+          input->PopLimit(packed_limit);
+          break;
+        }
+        return false;
+      }
+      default: { return false; }
+    }
+  }
+  return true;
+}
+// Convenience overload: wraps `byte_buffer` in a GrpcByteBufferSource and
+// forwards to Parse(Source*), returning its result unchanged.
+int VariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) {
+  GrpcByteBufferSource buffer_source;
+  buffer_source.Init(byte_buffer);
+  GrpcByteBufferSourceWrapper wrapped_source(&buffer_source);
+  return Parse(&wrapped_source);
+}
+// Parses a wire-format sendrecv::VariableMessage from `source`, one field at
+// a time.
+//
+// Metadata fields (varname, type, data_type, dims, lod_level, lod) are stored
+// in meta_. The `serialized` and `rows` payloads are not buffered: they are
+// copied directly into the variable named meta_.varname() in scope_, using the
+// dims/lod that have been parsed so far. The metadata therefore has to appear
+// in the stream before those payload fields.
+//
+// Returns:
+//   0      on success,
+//   -1     on an unknown field number or a malformed tag,
+//   other  the field number of the field that failed to parse.
+int VariableResponse::Parse(Source* source) {
+  ::google::protobuf::io::ZeroCopyInputStream* input_stream =
+      source->contents();
+  ::google::protobuf::io::CodedInputStream input(input_stream);
+  input.SetTotalBytesLimit(INT_MAX, INT_MAX);
+  while (true) {
+    auto p = input.ReadTagWithCutoff(127);
+    int tag = GetTagFieldNumber(p.first);
+    WireType wt = GetTagWireType(p.first);
+    if (!p.second) {
+      // No usable tag: field number 0 is treated as end of message, anything
+      // else as an error.
+      if (tag != 0) {
+        return -1;
+      }
+      return 0;
+    }
+    switch (tag) {
+      case sendrecv::VariableMessage::kVarnameFieldNumber: {
+        uint32_t length;
+        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
+          return tag;
+        }
+        std::string temp;
+        if (!input.ReadString(&temp, length)) {
+          return tag;
+        }
+        meta_.set_varname(temp);
+        break;
+      }
+      case sendrecv::VariableMessage::kTypeFieldNumber: {
+        uint64_t v;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+          return tag;
+        }
+        meta_.set_type(static_cast<::sendrecv::VarType>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kDataTypeFieldNumber: {
+        uint64_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+          return tag;
+        }
+        meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kDimsFieldNumber: {
+        // not packed
+        if (wt == WIRETYPE_VARINT) {
+          uint64_t v;
+          if (!input.ReadVarint64(&v)) {
+            return tag;
+          }
+          meta_.add_dims(v);
+          break;
+        }
+        // packed
+        if (wt == WIRETYPE_LENGTH_DELIMITED) {
+          int length = 0;
+          if (!input.ReadVarintSizeAsInt(&length)) {
+            return tag;
+          }
+          // `length` is the packed payload size in bytes, not an element
+          // count, so read varints until that many bytes are consumed.
+          auto packed_limit = input.PushLimit(length);
+          while (input.BytesUntilLimit() > 0) {
+            uint64_t v;
+            if (!input.ReadVarint64(&v)) {
+              return tag;
+            }
+            meta_.add_dims(v);
+          }
+          input.PopLimit(packed_limit);
+          break;
+        }
+        return tag;
+      }
+      case sendrecv::VariableMessage::kLodLevelFieldNumber: {
+        uint64_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+          return tag;
+        }
+        meta_.set_lod_level(static_cast<int64_t>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kLodFieldNumber: {
+        int length = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &length)) {
+          return tag;
+        }
+        // Confine the nested parser to the bytes of this sub-message.
+        std::pair<::google::protobuf::io::CodedInputStream::Limit, int>
+            sub_limit = input.IncrementRecursionDepthAndPushLimit(length);
+        std::vector<int64_t> lod_data;
+        if (sub_limit.second < 0 || !ParseLodData(&input, &lod_data)) {
+          return tag;
+        }
+        if (!input.DecrementRecursionDepthAndPopLimit(sub_limit.first)) {
+          // Must be non-zero: 0 is this function's success code.
+          return tag;
+        }
+        // An empty level is not recorded in meta_.
+        if (lod_data.size() == 0) {
+          break;
+        }
+        auto lod = meta_.add_lod();
+        for (uint32_t i = 0; i < lod_data.size(); i++) {
+          lod->add_lod_data(lod_data[i]);
+        }
+        break;
+      }
+      case sendrecv::VariableMessage::kSerializedFieldNumber: {
+        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
+                        meta_.type() == sendrecv::LOD_TENSOR) &&
+                           meta_.varname() != "",
+                       "meta info should be got first!");
+        int length = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &length)) {
+          return tag;
+        }
+        // Uses whatever dims have been parsed up to this point.
+        framework::DDim dims = GetDims(meta_.dims());
+        if (meta_.type() == sendrecv::LOD_TENSOR) {
+          PADDLE_ENFORCE(meta_.lod_size() >= 0,
+                         "lod info should be got first!");
+          if (!CopyLodTensorData(&input, *dev_ctx_, dims, length)) {
+            return tag;
+          }
+          break;
+        }
+        if (meta_.type() == sendrecv::SELECTED_ROWS) {
+          if (!CopySelectRowsTensorData(&input, *dev_ctx_, dims, length)) {
+            return tag;
+          }
+          break;
+        }
+        return tag;
+      }
+      case sendrecv::VariableMessage::kRowsFieldNumber: {
+        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
+                        meta_.type() == sendrecv::LOD_TENSOR) &&
+                           meta_.varname() != "",
+                       "meta info should be got first!");
+        int length = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &length)) {
+          return tag;
+        }
+        if (!CopySelectRowsData(&input, *dev_ctx_, length)) {
+          return tag;
+        }
+        break;
+      }
+      default: {
+        // Unknown tag, return unknown error.
+        return -1;
+      }
+    }
+  }
+  return 0;
+}
+};  // namespace detail
+};  // namespace operators
+};  // namespace paddle
--- a/paddle/fluid/operators/detail/variable_response.h
+++ b/paddle/fluid/operators/detail/variable_response.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/detail/send_recv.pb.h"
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
+namespace paddle {
+namespace operators {
+namespace detail {
+// Incremental parser for a wire-format sendrecv::VariableMessage. Metadata is
+// collected in meta_; the variable the message describes is looked up by name
+// in the scope passed to the constructor.
+class VariableResponse {
+ public:
+  // Stores `scope` and `dev_ctx` as raw pointers. This class never deletes
+  // them (the destructor is empty).
+  // NOTE(review): presumably both must outlive this object - confirm at the
+  // call sites.
+  VariableResponse(const framework::Scope* scope,
+                   const platform::DeviceContext* dev_ctx)
+      : scope_(scope), dev_ctx_(dev_ctx) {}
+  virtual ~VariableResponse() {}
+  // return:
+  // 0:ok.
+  // -1: unknown error.
+  // other: number of error field.
+  int Parse(Source* source);
+  // return:
+  // 0:ok.
+  // -1: unknown error.
+  // other: number of error field.
+  int Parse(const ::grpc::ByteBuffer& byte_buffer);
+  // Returns the variable name currently stored in meta_.
+  inline std::string Varname() { return meta_.varname(); }
+  // should call parse first.
+  // Looks up the variable named meta_.varname() in scope_ and returns
+  // whatever FindVar yields for it.
+  framework::Variable* GetVar() { return scope_->FindVar(meta_.varname()); }
+ private:
+  // Helpers that copy `length` bytes of payload from `input` into the
+  // variable named meta_.varname(): the SelectedRows value tensor, the
+  // SelectedRows rows, and the LoDTensor data respectively.
+  bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input,
+                                const platform::DeviceContext& ctx,
+                                framework::DDim& dims, int length);
+  bool CopySelectRowsData(::google::protobuf::io::CodedInputStream* input,
+                          const platform::DeviceContext& ctx, int length);
+  bool CopyLodTensorData(::google::protobuf::io::CodedInputStream* input,
+                         const platform::DeviceContext& ctx,
+                         framework::DDim& dims, int length);
+ private:
+  // Scope in which the received variable is looked up by name.
+  const framework::Scope* scope_;
+  // Device context handed to the payload-copy helpers.
+  const platform::DeviceContext* dev_ctx_;
+  // only Skeleton
+  sendrecv::VariableMessage meta_;
+};
+};  // namespace detail
+};  // namespace operators
+};  // namespace paddle
--- a/paddle/fluid/operators/detection_map_op.cc
+++ b/paddle/fluid/operators/detection_map_op.cc
@@ -188,8 +188,8 @@ The general steps are as follows. First, calculate the true positive and
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(detection_map, ops::DetectionMAPOp,
+REGISTER_OPERATOR(detection_map, ops::DetectionMAPOp, ops::DetectionMAPOpMaker,
-                             ops::DetectionMAPOpMaker);
+                  paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(
    detection_map, ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, float>,
    ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -35,7 +35,6 @@ class DropoutOp : public framework::OperatorWithKernel {
  }
 };
-template <typename AttrType>
 class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
@@ -73,7 +72,6 @@ are set equal to their corresponding inputs.
  }
 };
-template <typename AttrType>
 class DropoutOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -103,11 +101,10 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker<float>, dropout_grad,
+REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad,
-            ops::DropoutOpGrad<float>);
+            ops::DropoutOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    dropout,
+    dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>);
-    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float, float>);
 REGISTER_OP_CPU_KERNEL(
    dropout_grad,
    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>);
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -18,20 +18,22 @@ limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
 #include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/platform/float16.h"
 namespace paddle {
 namespace operators {
-template <typename T, typename AttrType>
+template <typename T>
 __global__ void RandomGenerator(const size_t n, const int seed,
-                                const AttrType dropout_prob, const T* src,
+                                const float dropout_prob, const T* src,
                                T* mask_data, T* dst) {
  thrust::minstd_rand rng;
  rng.seed(seed);
-  thrust::uniform_real_distribution<AttrType> dist(0, 1);
+  thrust::uniform_real_distribution<float> dist(0, 1);
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  for (; idx < n; idx += blockDim.x * gridDim.x) {
+    rng.discard(idx);
    if (dist(rng) < dropout_prob) {
      mask_data[idx] = static_cast<T>(0);
    } else {
@@ -44,14 +46,14 @@ __global__ void RandomGenerator(const size_t n, const int seed,
 // It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
-template <typename Place, typename T, typename AttrType>
+template <typename Place, typename T>
 class GPUDropoutKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<Tensor>("X");
    auto* y = context.Output<Tensor>("Out");
    y->mutable_data<T>(context.GetPlace());
-    AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
+    float dropout_prob = context.Attr<float>("dropout_prob");
    auto X = EigenMatrix<T>::Reshape(*x, 1);
    auto Y = EigenMatrix<T>::Reshape(*y, 1);
@@ -70,11 +72,11 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
      int threads = 512;
      int grid = (x->numel() + threads - 1) / threads;
-      RandomGenerator<T, AttrType><<<grid, threads, 0,
+      RandomGenerator<
-                                     context.cuda_device_context().stream()>>>(
+          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
          size, seed, dropout_prob, x_data, mask_data, y_data);
    } else {
-      Y.device(place) = X * (1.0f - dropout_prob);
+      Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
    }
  }
 };
@@ -83,9 +85,9 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
-    dropout,
+    dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
-    ops::GPUDropoutKernel<paddle::platform::CUDADeviceContext, float, float>);
+    ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
+REGISTER_OP_CUDA_KERNEL(dropout_grad,
-    dropout_grad,
+                        ops::DropoutGradKernel<plat::CUDADeviceContext, float>);
-    ops::DropoutGradKernel<paddle::platform::CUDADeviceContext, float>);
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
@@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-template <typename DeviceContext, typename T, typename AttrType>
+template <typename DeviceContext, typename T>
 class CPUDropoutKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {

--- a/paddle/fluid/operators/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise_add_op.cu
@@ -14,19 +14,20 @@ limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_add_op.h"
+#include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
-    elementwise_add,
+    elementwise_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
    elementwise_add_grad,
-    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
-                                  int64_t>);
--- a/paddle/fluid/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/profiler.h"
 namespace paddle {
 namespace operators {
@@ -28,6 +29,10 @@ class FeedOp : public framework::OperatorBase {
 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
+    // get device context from pool
+    auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+    platform::RecordEvent record_event(Type(), dev_ctx);
    auto feed_var_name = Input("X");
    auto *feed_var = scope.FindVar(feed_var_name);
@@ -50,14 +55,10 @@ class FeedOp : public framework::OperatorBase {
    auto &feed_item = feed_list.at(static_cast<size_t>(col));
    auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
    if (platform::is_same_place(feed_item.place(), place)) {
      out_item->ShareDataWith(feed_item);
    } else {
-      framework::TensorCopy(feed_item, place, dev_ctx, out_item);
+      framework::TensorCopy(feed_item, place, *dev_ctx, out_item);
    }
    out_item->set_lod(feed_item.lod());
  }

--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/profiler.h"
 namespace paddle {
 namespace operators {
@@ -29,6 +30,9 @@ class FetchOp : public framework::OperatorBase {
 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    platform::RecordEvent record_event(Type(), pool.Get(place));
    auto fetch_var_name = Input("X");
    auto *fetch_var = scope.FindVar(fetch_var_name);
    PADDLE_ENFORCE(fetch_var != nullptr,
@@ -53,7 +57,6 @@ class FetchOp : public framework::OperatorBase {
    // FIXME(yuyang18): Should we assume the fetch operator always generate
    // CPU outputs?
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(src_item.place());
    TensorCopy(src_item, platform::CPUPlace(), dev_ctx, &dst_item);

--- a/paddle/fluid/operators/iou_similarity_op.cc
+++ b/paddle/fluid/operators/iou_similarity_op.cc
@@ -87,8 +87,9 @@ $$
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(iou_similarity, ops::IOUSimilarityOp,
+REGISTER_OPERATOR(iou_similarity, ops::IOUSimilarityOp,
-                             ops::IOUSimilarityOpMaker);
+                  ops::IOUSimilarityOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(
    iou_similarity,

--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/proto_desc.h"
+#include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/detail/grpc_server.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
 #include "paddle/fluid/operators/detail/simple_block_queue.h"
@@ -68,9 +69,7 @@ class ListenAndServOp : public framework::OperatorBase {
  }
  void Stop() override {
-    detail::MessageWithName term_msg;
+    rpc_service_->Push(LISTEN_TERMINATE_MESSAGE);
-    term_msg.first = LISTEN_TERMINATE_MESSAGE;
-    rpc_service_->Push(term_msg);
    rpc_service_->ShutDown();
    server_thread_->join();
  }
@@ -89,6 +88,10 @@ class ListenAndServOp : public framework::OperatorBase {
    auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
    auto *program = block->Program();
+    int num_blocks = program->Size();
+    PADDLE_ENFORCE_GE(num_blocks, 2,
+                      "server program should have at least 2 blocks");
    framework::Executor executor(dev_place);
    // TODO(typhoonzero): change this to a while_op for every cluster-batch.
@@ -103,7 +106,7 @@ class ListenAndServOp : public framework::OperatorBase {
      size_t recv_var_cnt = 0;
      int batch_barrier = 0;
      while (batch_barrier != fan_in) {
-        const detail::MessageWithName &v = rpc_service_->Get();
+        const detail::ReceivedMessage v = rpc_service_->Get();
        auto recv_var_name = v.first;
        if (recv_var_name == LISTEN_TERMINATE_MESSAGE) {
          LOG(INFO) << "received terminate message and exit";
@@ -116,12 +119,11 @@ class ListenAndServOp : public framework::OperatorBase {
        } else {
          VLOG(3) << "received grad: " << recv_var_name;
          recv_var_cnt++;
-          auto *var = recv_scope.FindVar(recv_var_name);
+          auto var = v.second->GetVar();
          if (var == nullptr) {
            LOG(ERROR) << "Can not find server side var: " << recv_var_name;
            PADDLE_THROW("Can not find server side var");
          }
-          detail::DeserializeFromMessage(v.second, dev_ctx, var);
          if (var->IsType<framework::SelectedRows>()) {
            sparse_vars.push_back(var);
          }
@@ -132,12 +134,35 @@ class ListenAndServOp : public framework::OperatorBase {
        rpc_service_->ShutDown();
        break;
      }
-      try {
-        executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
+      // put optimize blocks in the thread pool to start run, the last block
-                     false /*create_local_scope*/, false /*create_vars*/);
+      // should be global ops.
-      } catch (std::exception &e) {
+      // NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads
-        LOG(ERROR) << "run sub program error " << e.what();
+      // and this will still work.
+      std::vector<std::future<void>> fs;
+      // block0 contains only listen_and_serv op, start run from block1.
+      for (int blkid = 1; blkid < num_blocks - 1; ++blkid) {
+        fs.push_back(
+            framework::Async([&executor, &program, &recv_scope, blkid]() {
+              int run_block = blkid;  // thread local
+              try {
+                executor.Run(*program, &recv_scope, run_block, false, false);
+              } catch (std::exception &e) {
+                LOG(ERROR) << "run sub program error " << e.what();
+              }
+            }));
+      }
+      for (int i = 0; i < num_blocks - 2; ++i) fs[i].wait();
+      // Run global block at final step, or block1 if there are only 2 blocks
+      if (num_blocks >= 2) {
+        try {
+          executor.Run(*program, &recv_scope, num_blocks - 1, false, false);
+        } catch (std::exception &e) {
+          LOG(ERROR) << "run sub program error " << e.what();
+        }
      }
      // Reset the received sparse variables, the sum operator would not
      // sum the input sparse variables which rows is empty at the next
      // mini-batch.
@@ -151,6 +176,10 @@ class ListenAndServOp : public framework::OperatorBase {
      rpc_service_->WaitClientGet(fan_in);
      sparse_vars.clear();
    }  // while(true)
+    // for (int i = 0; i < num_blocks; ++i) {
+    //   delete blk_ctx_list[i];
+    // }
  }
 protected:

--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/profiler.h"
 namespace paddle {
 namespace operators {
@@ -29,6 +30,9 @@ class LoadOp : public framework::OperatorBase {
 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
+    auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+    platform::RecordEvent record_event(Type(), dev_ctx);
    auto filename = Attr<std::string>("file_path");
    std::ifstream fin(filename);
    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
@@ -41,9 +45,7 @@ class LoadOp : public framework::OperatorBase {
    auto *tensor = out_var->GetMutable<framework::LoDTensor>();
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    DeserializeFromStream(fin, tensor, *dev_ctx);
-    auto &dev_ctx = *pool.Get(place);
-    DeserializeFromStream(fin, tensor, dev_ctx);
    if (platform::is_gpu_place(place)) {
      // copy CPU to GPU
@@ -55,7 +57,7 @@ class LoadOp : public framework::OperatorBase {
      out_var->Clear();
      tensor = out_var->GetMutable<framework::LoDTensor>();
      tensor->set_lod(cpu_tensor.lod());
-      TensorCopy(cpu_tensor, place, dev_ctx, tensor);
+      TensorCopy(cpu_tensor, place, *dev_ctx, tensor);
    }
  }
 };

--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
@@ -22,17 +22,16 @@ class LoDResetOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext *ctx) const override {
-    // input check
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of LoDResetOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of LoDResetOp should not be null.");
-    // If target LoD is not set form Input(), then it must be set from Attr().
-    if (!ctx->HasInput("TargetLoD")) {
+    if (!ctx->HasInput("Y")) {
      auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod");
-      PADDLE_ENFORCE(level0.size() > 1,
+      PADDLE_ENFORCE_GT(level0.size(), 1,
-                     "Target LoD is not found, should be set to be a valid one "
+                        "If Input(Y) not provided, the target lod should be "
-                     "through Input() or Attr().");
+                        "specified by attribute `target_lod`.");
    }
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
  }
@@ -50,36 +49,77 @@ class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  LoDResetOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(LoDTensor) The input tensor of lod_reset operator.");
+    AddInput("X",
-    AddInput("TargetLoD",
+             "(Tensor, LoDTensor) Input variable of LoDResetOp which "
-             "(Tensor, optional) The target level 0 LoD from Input().")
+             "could be a Tensor or LoDTensor, where the data of output "
+             "variable inherits from.");
+    AddInput("Y",
+             "(Tensor, LoDTensor, optional) If provided and Y is LoDTensor, "
+             "lod of Input(Y) would be considered as the target lod first, "
+             "otherwise data of Input(Y) would be considered as the "
+             "target lod.")
        .AsDispensable();
-    AddOutput("Out", "(LoDTensor) The output tensor of lod_reset operator.");
+    AddOutput("Out",
+              "(LoDTensor) Output variable of LoDResetOp which should be a "
+              "LoDTensor.");
    AddAttr<std::vector<int>>("target_lod",
                              "The target level 0 LoD from Attr().")
        .SetDefault(std::vector<int>{});
    AddComment(R"DOC(LoDReset operator
-Reset LoD of Input(X) into a new one specified by Input(TargetLoD) or
+Set LoD of `X` to a new one specified by `Y` or attribute `target_lod`. When `Y`
-Attr(target_lod), or set LoD for Input(X) if it doesn't have one.
+provided and `Y` is a LoDTensor, `Y.lod` would be considered as target LoD
-Currently the lod_reset operator only supports the reset of level 0 LoD.
+first, otherwise `Y.data` would be considered as target LoD. If `Y` is not
-At least one of Input(TargetLoD) and Attr(target_lod) must be set,
+provided, target LoD should be specified by attribute `target_lod`.
-and if both of them are set, Input(TargetLoD) will be chosen as the
+If target LoD is specified by `Y.data` or `target_lod`, only one level LoD
-target LoD.
+is supported.
+Example 1:
+Given a 1-level LoDTensor input(X):
+    X.lod =  [[ 0,     2,                   5      6 ]]
+    X.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    X.dims = [6, 1]
+attr(target_lod): [0, 4, 6]
+then we get a 1-level LoDTensor:
+    Out.lod =  [[ 0,                   4,            6 ]]
+    Out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    Out.dims = [6, 1]
+Example 2:
-An example:
+Given a 1-level LoDTensor input(X):
-Given a float LoDTensor X with shape (6, 1), its transpose form represents
+    X.lod =  [[ 0,     2,                   5      6 ]]
+    X.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    X.dims = [6, 1]
-    [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+input(Y) is a Tensor:
+    Y.data = [[0, 2, 6]]
+    Y.dims = [1, 3]
-with LoD = [[0, 2, 5, 6]] and the three (transposed) sequences look like
+then we get a 1-level LoDTensor:
+    Out.lod =  [[ 0,     2,                          6 ]]
+    Out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    Out.dims = [6, 1]
-    [1.0, 2.0], [3.0, 4.0, 5.0], [6.0].
+Example 3:
-If target LoD = [0, 4, 6], the lod_reset operator will reset the LoD and
+Given a 1-level LoDTensor input(X):
-the sequences that the LoDTensor Output(Out) contains becomes:
+    X.lod =  [[ 0,      2,                   5     6 ]]
+    X.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    X.dims = [6, 1]
-    [1.0, 2.0, 3.0, 4.0], [5.0, 6.0].
+input(Y) is a 2-level LoDTensor:
+    Y.lod =  [[0, 2, 4], [0, 2, 5, 6]]
+    Y.data = [[1.1], [2.1], [3.1], [4.1], [5.1], [6.1]]
+    Y.dims = [6, 1]
+then we get a 2-level LoDTensor:
+    Out.lod =  [[0, 2, 4], [0, 2, 5, 6]]
+    Out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    Out.dims = [6, 1]
 )DOC");
  }
@@ -90,10 +130,16 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LoDResetGradOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) shouldn't be null.");
+                   "Input(Out@Grad) of LoDResetGradOp should not be null.");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
+      ctx->ShareLoD("X", /*->*/ x_grad_name);
+    }
  }
 protected:
@@ -111,9 +157,13 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad,
            ops::LoDResetGradOp);
-REGISTER_OP_CPU_KERNEL(lod_reset,
+REGISTER_OP_CPU_KERNEL(
-                       ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
+    lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
-                       ops::LoDResetKernel<paddle::platform::CPUPlace, double>);
+    ops::LoDResetKernel<paddle::platform::CPUPlace, double>,
+    ops::LoDResetKernel<paddle::platform::CPUPlace, int>,
+    ops::LoDResetKernel<paddle::platform::CPUPlace, int64_t>);
 REGISTER_OP_CPU_KERNEL(
    lod_reset_grad, ops::LoDResetGradKernel<paddle::platform::CPUPlace, float>,
-    ops::LoDResetGradKernel<paddle::platform::CPUPlace, double>);
+    ops::LoDResetGradKernel<paddle::platform::CPUPlace, double>,
+    ops::LoDResetGradKernel<paddle::platform::CPUPlace, int>,
+    ops::LoDResetGradKernel<paddle::platform::CPUPlace, int64_t>);
--- a/paddle/fluid/operators/lod_reset_op.cu
+++ b/paddle/fluid/operators/lod_reset_op.cu
@@ -18,8 +18,12 @@ namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
    lod_reset, ops::LoDResetKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
    lod_reset_grad,
    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/lod_reset_op.h
+++ b/paddle/fluid/operators/lod_reset_op.h
--- a/paddle/fluid/operators/lrn_mkldnn_op.cc
+++ b/paddle/fluid/operators/lrn_mkldnn_op.cc
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -6,6 +6,7 @@ function(math_library TARGET)
    # But it handle split GPU/CPU code and link some common library.
    set(cc_srcs)
    set(cu_srcs)
+    set(hip_srcs)
    set(math_common_deps device_context framework_proto)
    set(multiValueArgs DEPS)
    cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
@@ -17,10 +18,15 @@ function(math_library TARGET)
    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
        list(APPEND cu_srcs ${TARGET}.cu)
    endif()
+    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
+        list(APPEND hip_srcs ${TARGET}.hip.cu)
+    endif()
    list(LENGTH cc_srcs cc_srcs_len)
    if (WITH_GPU)
        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
+    elseif (WITH_AMD_GPU)
+        hip_library(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
    elseif(${cc_srcs_len} GREATER 0)
        cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
    endif()
@@ -43,7 +49,7 @@ math_library(sequence2batch)
 math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
-math_library(softmax)
+math_library(softmax DEPS math_function)
 math_library(unpooling)
 math_library(vol2col)

--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat.cc
--- a/paddle/fluid/operators/math/concat.cu
+++ b/paddle/fluid/operators/math/concat.cu
--- a/paddle/fluid/operators/math/concat.hip.cu
+++ b/paddle/fluid/operators/math/concat.hip.cu
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
--- a/paddle/fluid/operators/mine_hard_examples_op.cc
+++ b/paddle/fluid/operators/mine_hard_examples_op.cc
--- a/paddle/fluid/operators/mkldnn_activation_op.h
+++ b/paddle/fluid/operators/mkldnn_activation_op.h
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
--- a/paddle/fluid/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/prior_box_op.cc
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
--- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
--- a/paddle/fluid/operators/reader/open_files_op.cc
+++ b/paddle/fluid/operators/reader/open_files_op.cc
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
--- a/paddle/fluid/operators/reader/reader_op_registry.h
+++ b/paddle/fluid/operators/reader/reader_op_registry.h
--- a/paddle/fluid/operators/select_op.cc
+++ b/paddle/fluid/operators/select_op.cc
--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
--- a/paddle/fluid/operators/send_vars_op.cc
+++ b/paddle/fluid/operators/send_vars_op.cc
--- a/paddle/fluid/operators/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_expand_op.cc
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
--- a/paddle/fluid/operators/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/softmax_mkldnn_op.cc
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
--- a/paddle/fluid/operators/target_assign_op.cc
+++ b/paddle/fluid/operators/target_assign_op.cc
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
--- a/paddle/fluid/recordio/header.cc
+++ b/paddle/fluid/recordio/header.cc
--- a/paddle/fluid/recordio/scanner.cc
+++ b/paddle/fluid/recordio/scanner.cc
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
--- a/paddle/scripts/tools/build_docs/.gitignore
+++ b/paddle/scripts/tools/build_docs/.gitignore
--- a/paddle/scripts/tools/build_docs/build_docs.sh
+++ b/paddle/scripts/tools/build_docs/build_docs.sh
--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
--- a/python/paddle/fluid/debuger.py
+++ b/python/paddle/fluid/debuger.py
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
--- a/python/paddle/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/test_machine_translation.py
--- a/python/paddle/fluid/tests/unittests/.gitignore
+++ b/python/paddle/fluid/tests/unittests/.gitignore
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_cast_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cast_op.py
--- a/python/paddle/fluid/tests/unittests/test_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
--- a/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
--- a/python/paddle/fluid/tests/unittests/test_lrn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py
--- a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
--- a/python/paddle/fluid/tests/unittests/test_multiple_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multiple_reader.py
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
--- a/python/paddle/fluid/tests/unittests/test_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py